diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -99,10 +99,10 @@ SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); - LLVM_DEBUG( - auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); - dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); + LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n';); } else { // Remove dead index MF.getFrameInfo().RemoveStackObject(FI); @@ -264,7 +264,7 @@ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -309,7 +309,7 @@ void restoreFromVGPRLane(const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -1353,8 +1353,8 @@ if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { assert(RS != nullptr); - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); + RS->enterBasicBlockEnd(MBB); + RS->backward(MI); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -50,7 +50,9 @@ SILowerSGPRSpills() : MachineFunctionPass(ID) {} void 
calculateSaveRestoreBlocks(MachineFunction &MF); - bool spillCalleeSavedRegs(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF, + SmallVectorImpl &CalleeSavedFIs); + void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); bool runOnMachineFunction(MachineFunction &MF) override; @@ -58,6 +60,13 @@ AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getClearedProperties() const override { + // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs. + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::NoVRegs); + } }; } // end anonymous namespace @@ -197,7 +206,8 @@ EntryBB.sortUniqueLiveIns(); } -bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { +bool SILowerSGPRSpills::spillCalleeSavedRegs( + MachineFunction &MF, SmallVectorImpl &CalleeSavedFIs) { MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget(); @@ -228,6 +238,7 @@ TRI->getSpillAlign(*RC), true); CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + CalleeSavedFIs.push_back(JunkFI); } } @@ -248,6 +259,50 @@ return false; } +void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, + LiveIntervals *LIS) { + // TODO: This is a workaround to avoid the unmodelled liveness computed with + // whole-wave virtual registers when allocated together with the regular VGPR + // virtual registers. Presently, the liveness computed during the regalloc is + // only uniform (or single lane aware) and it doesn't take account of the + // divergent control flow that exists for our GPUs. Since the WWM registers + // can modify inactive lanes, the wave-aware liveness should be computed for + // the virtual registers to accurately plot their interferences. Without + // having the divergent CFG for the function, it is difficult to implement the + // wave-aware liveness info. 
Until then, we conservatively extend the liveness + // of the wwm registers into the entire function so that they won't be reused + // without first spilling/splitting their liveranges. + SIMachineFunctionInfo *MFI = MF.getInfo(); + + // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. + for (auto Reg : MFI->getSGPRSpillVGPRs()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) { + MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); + DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore); + auto MIB = BuildMI(*SaveBlock, InsertBefore, DL, + TII->get(AMDGPU::IMPLICIT_DEF), Reg); + MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); + if (LIS) + LIS->InsertMachineInstrInMaps(*MIB); + } + } + + // Insert the KILL in the return blocks to extend their liveness until the + // end of function. Insert a separate KILL for each VGPR. + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { + MachineBasicBlock::iterator InsertBefore = + RestoreBlock->getFirstTerminator(); + DebugLoc DL = RestoreBlock->findDebugLoc(InsertBefore); + for (auto Reg : MFI->getSGPRSpillVGPRs()) { + auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL, + TII->get(TargetOpcode::KILL)); + MIB.addReg(Reg); + if (LIS) + LIS->InsertMachineInstrInMaps(*MIB); + } + } +} + bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -261,7 +316,8 @@ // First, expose any CSR SGPR spills. This is mostly the same as what PEI // does, but somewhat simpler. calculateSaveRestoreBlocks(MF); - bool HasCSRs = spillCalleeSavedRegs(MF); + SmallVector CalleeSavedFIs; + bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -275,6 +331,7 @@ bool MadeChange = false; bool NewReservedRegs = false; + bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter.
@@ -297,23 +354,53 @@ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { - NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS); - (void)Spilled; - assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - SpillFIs.set(FI); + + bool IsCalleeSaveSGPRSpill = + std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) != + CalleeSavedFIs.end(); + if (IsCalleeSaveSGPRSpill) { + // Spill callee-saved SGPRs into physical VGPR lanes. + + // TODO: This is to ensure the CFIs are static for efficient frame + // unwinding in the debugger. Spilling them into virtual VGPR lanes + // involves regalloc to allocate the physical VGPRs and that might + // cause intermediate spill/split of such liveranges for successful + // allocation. This would result in broken CFI encoding unless the + // regalloc-aware CFI generation to insert new CFIs along with the + // intermediate spills is implemented. No such support + // currently exists in the LLVM compiler. + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { + NewReservedRegs = true; + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS, true); + if (!Spilled) + llvm_unreachable( + "failed to spill SGPR to physical VGPR lane when allocated"); + } + } else { + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS); + if (!Spilled) + llvm_unreachable( + "failed to spill SGPR to virtual VGPR lane when allocated"); + SpillFIs.set(FI); + SpilledToVirtVGPRLanes = true; + } } } } - // FIXME: Adding to live-ins redundant with reserving registers.
- for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); + if (SpilledToVirtVGPRLanes) { + extendWWMVirtRegLiveness(MF, LIS); + if (LIS) { + // Compute the LiveInterval for the newly created virtual registers. + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) + LIS->createAndComputeVirtRegInterval(Reg); + } + } + for (MachineBasicBlock &MBB : MF) { // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -334,6 +421,10 @@ // lane". FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); + MadeChange = true; + } + + if (SpilledToVirtVGPRLanes) { const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); // Shift back the reserved SGPR for EXEC copy into the lowest range. // This SGPR is reserved to handle the whole-wave spill/copy operations @@ -342,20 +433,21 @@ if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); - - MadeChange = true; } else { - // No SGPR spills and hence there won't be any WWM spills/copies. Reset the - // SGPR reserved for EXEC copy. + // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM + // spills/copies. Reset the SGPR reserved for EXEC copy. FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any VGPRs added for SGPR spills. - if (NewReservedRegs) - MRI.freezeReservedRegs(MF); + // Updated the reserved registers with any physical VGPRs added for SGPR + // spills. 
+ if (NewReservedRegs) { + for (Register Reg : FuncInfo->getWWMReservedRegs()) + MRI.reserveReg(Reg, TRI); + } return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -496,15 +496,16 @@ }; private: - // To track VGPR + lane index for each subregister of the SGPR spilled to - // frameindex key during SILowerSGPRSpills pass. - DenseMap> SGPRSpillToVGPRLanes; - // To track VGPR + lane index for spilling special SGPRs like Frame Pointer - // identified during PrologEpilogInserter. + // To track virtual VGPR + lane index for each subregister of the SGPR spilled + // to frameindex key during SILowerSGPRSpills pass. DenseMap> - PrologEpilogSGPRSpillToVGPRLanes; - unsigned NumVGPRSpillLanes = 0; - unsigned NumVGPRPrologEpilogSpillLanes = 0; + SGPRSpillsToVirtualVGPRLanes; + // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs + // like Frame Pointer identified during PrologEpilogInserter. 
+ DenseMap> + SGPRSpillsToPhysicalVGPRLanes; + unsigned NumVirtualVGPRSpillLanes = 0; + unsigned NumPhysicalVGPRSpillLanes = 0; SmallVector SpillVGPRs; using WWMSpillsMap = MapVector; // To track the registers used in instructions that can potentially modify the @@ -548,10 +549,10 @@ private: Register VGPRForAGPRCopy; - bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); - bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); public: Register getVGPRForAGPRCopy() const { @@ -583,9 +584,9 @@ SIModeRegisterDefaults getMode() const { return Mode; } ArrayRef - getSGPRSpillToVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillToVGPRLanes.find(FrameIndex); - return (I == SGPRSpillToVGPRLanes.end()) + getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex); + return (I == SGPRSpillsToVirtualVGPRLanes.end()) ? ArrayRef() : ArrayRef(I->second); } @@ -647,9 +648,9 @@ } ArrayRef - getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { - auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); - return (I == PrologEpilogSGPRSpillToVGPRLanes.end()) + getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex); + return (I == SGPRSpillsToPhysicalVGPRLanes.end()) ? 
ArrayRef() : ArrayRef(I->second); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -314,37 +314,23 @@ return false; } -bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, - int FI, - unsigned LaneIndex) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); +bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( + MachineFunction &MF, int FI, unsigned LaneIndex) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRSpillToVGPRLanes.erase(FI); - return false; - } - + LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); SpillVGPRs.push_back(LaneVGPR); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } - SGPRSpillToVGPRLanes[FI].push_back( + SGPRSpillsToVirtualVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } -bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( +bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -355,16 +341,21 @@ if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. 
- PrologEpilogSGPRSpillToVGPRLanes.erase(FI); + SGPRSpillsToPhysicalVGPRLanes.erase(FI); return false; } allocateWWMSpill(MF, LaneVGPR); + reserveWWMRegister(LaneVGPR); + for (MachineBasicBlock &MBB : MF) { + MBB.addLiveIn(LaneVGPR); + MBB.sortUniqueLiveIns(); + } } else { - LaneVGPR = WWMSpills.back().first; + LaneVGPR = WWMReservedRegs.back(); } - PrologEpilogSGPRSpillToVGPRLanes[FI].push_back( + SGPRSpillsToPhysicalVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } @@ -373,8 +364,8 @@ int FI, bool IsPrologEpilog) { std::vector &SpillLanes = - IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI] - : SGPRSpillToVGPRLanes[FI]; + IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI] + : SGPRSpillsToVirtualVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -395,15 +386,14 @@ "not spilling SGPRs to VGPRs"); unsigned &NumSpillLanes = - IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes; + IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = - IsPrologEpilog - ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex) - : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); + bool Allocated = IsPrologEpilog + ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) + : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; return false; @@ -484,16 +474,25 @@ bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame. 
And also make sure to remove - // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it - // could result in an unexpected side effect and bug, in case of any - // re-mapping of freed frame indices by later pass(es) like "stack slot + // Remove dead frame indices from function frame, however keep FP & BP since + // spills for them haven't been inserted yet. And also make sure to remove the + // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure, + // otherwise, it could result in an unexpected side effect and bug, in case of + // any re-mapping of freed frame indices by later pass(es) like "stack slot // coloring". - for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) { + for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) { MFI.RemoveStackObject(R.first); - SGPRSpillToVGPRLanes.erase(R.first); + SGPRSpillsToVirtualVGPRLanes.erase(R.first); } + // Remove the dead frame indices of CSR SGPRs which are spilled to physical + // VGPR lanes during SILowerSGPRSpills pass. + if (!ResetSGPRSpillStackIDs) { + for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) { + MFI.RemoveStackObject(R.first); + SGPRSpillsToPhysicalVGPRLanes.erase(R.first); + } + } bool HaveSGPRToMemory = false; if (ResetSGPRSpillStackIDs) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -142,14 +142,17 @@ void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill = true) const; - /// If \p OnlyToVGPR is true, this will only succeed if this + /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a + /// free VGPR lane to spill. 
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false) const; + bool OnlyToVGPR = false, + bool SpillToPhysVGPRLane = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false) const; + bool OnlyToVGPR = false, + bool SpillToPhysVGPRLane = false) const; bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, @@ -163,10 +166,10 @@ unsigned FIOperandNum, RegScavenger *RS) const override; - bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS, - SlotIndexes *Indexes = nullptr, - LiveIntervals *LIS = nullptr) const; + bool eliminateSGPRToVGPRSpillFrameIndex( + MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, + bool SpillToPhysVGPRLane = false) const; StringRef getRegAsmName(MCRegister Reg) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -712,9 +712,6 @@ for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); - for (auto Reg : MFI->getSGPRSpillVGPRs()) - reserveRegisterTuples(Reserved, Reg); - return Reserved; } @@ -1736,10 +1733,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR) const { + LiveIntervals *LIS, bool OnlyToVGPR, + bool SpillToPhysVGPRLane) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); + ArrayRef VGPRSpills = + SpillToPhysVGPRLane ? 
SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) + : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1856,10 +1856,13 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR) const { + LiveIntervals *LIS, bool OnlyToVGPR, + bool SpillToPhysVGPRLane) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); + ArrayRef VGPRSpills = + SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) + : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -2005,7 +2008,7 @@ /// handled. bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes, LiveIntervals *LIS) const { + SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -2021,7 +2024,7 @@ case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, Indexes, LIS, true); + return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S384_RESTORE: @@ -2036,7 +2039,7 @@ case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, Indexes, LIS, true); + return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); default: llvm_unreachable("not an SGPR spill instruction"); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -11,13 +11,12 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v41, s16, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4 @@ -28,10 +27,9 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v41, 0 +; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -228,18 +228,17 @@ ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 +; MUBUF-NEXT: v_writelane_b32 v40, s4, 2 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: v_mov_b32_e32 v0, 11 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; MUBUF-NEXT: v_mov_b32_e32 v0, 12 -; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: s_getpc_b64 s[4:5] @@ -248,10 +247,9 @@ ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 -; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 @@ -265,10 +263,9 @@ ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 ; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -289,10 +286,9 @@ ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 -; FLATSCR-NEXT: v_readlane_b32 s0, v41, 0 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword v41, 
off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 @@ -310,13 +306,12 @@ ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s4, 2 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -377,10 +372,9 @@ ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 -; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 @@ -394,12 +388,11 @@ ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0 -; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 ; FLATSCR-NEXT: s_add_u32 s0, s32, 8 ; FLATSCR-NEXT: 
v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: s_add_u32 s2, s32, 56 @@ -444,10 +437,9 @@ ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 -; FLATSCR-NEXT: v_readlane_b32 s0, v41, 0 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -12,32 +12,39 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: ; implicit-def: $vgpr8 +; CHECK-NEXT: v_mov_b32_e32 v8, v0 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], 
s32 offset:68 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v13, v3 -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v9, v7 -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v2, v15 +; CHECK-NEXT: v_mov_b32_e32 v3, v14 +; CHECK-NEXT: v_mov_b32_e32 v4, v13 +; 
CHECK-NEXT: v_mov_b32_e32 v5, v12 +; CHECK-NEXT: v_mov_b32_e32 v6, v11 +; CHECK-NEXT: v_mov_b32_e32 v7, v10 +; CHECK-NEXT: v_mov_b32_e32 v8, v9 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -45,34 +52,40 @@ ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v8, s4, 0 -; CHECK-NEXT: v_writelane_b32 v8, s5, 1 -; CHECK-NEXT: v_writelane_b32 v8, s6, 2 -; CHECK-NEXT: v_writelane_b32 v8, s7, 3 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: v_writelane_b32 v0, s6, 2 +; CHECK-NEXT: v_writelane_b32 v0, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s5 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v8, s4, 4 +; CHECK-NEXT: v_writelane_b32 v0, s4, 4 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded 
Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte 
Folded Reload @@ -80,15 +93,16 @@ ; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s12, v7 -; CHECK-NEXT: v_readfirstlane_b32 s10, v6 -; CHECK-NEXT: v_readfirstlane_b32 s9, v5 -; CHECK-NEXT: v_readfirstlane_b32 s8, v4 -; CHECK-NEXT: v_readfirstlane_b32 s7, v3 -; CHECK-NEXT: v_readfirstlane_b32 s6, v2 -; CHECK-NEXT: v_readfirstlane_b32 s5, v1 -; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_readfirstlane_b32 s12, v8 +; CHECK-NEXT: v_readfirstlane_b32 s10, v7 +; CHECK-NEXT: v_readfirstlane_b32 s9, v6 +; CHECK-NEXT: v_readfirstlane_b32 s8, v5 +; CHECK-NEXT: v_readfirstlane_b32 s7, v4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v3 +; CHECK-NEXT: v_readfirstlane_b32 s5, v2 +; CHECK-NEXT: v_readfirstlane_b32 s4, v1 ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 ; CHECK-NEXT: s_mov_b32 s14, s9 @@ -97,68 +111,84 @@ ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v8, s12, 5 -; CHECK-NEXT: v_writelane_b32 v8, s13, 6 -; CHECK-NEXT: v_writelane_b32 v8, s14, 7 -; CHECK-NEXT: v_writelane_b32 v8, s15, 8 -; CHECK-NEXT: v_writelane_b32 v8, s16, 9 -; CHECK-NEXT: v_writelane_b32 v8, s17, 10 -; CHECK-NEXT: v_writelane_b32 v8, s18, 11 -; CHECK-NEXT: v_writelane_b32 v8, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v0, v15 -; CHECK-NEXT: 
v_mov_b32_e32 v1, v16 +; CHECK-NEXT: v_writelane_b32 v0, s12, 5 +; CHECK-NEXT: v_writelane_b32 v0, s13, 6 +; CHECK-NEXT: v_writelane_b32 v0, s14, 7 +; CHECK-NEXT: v_writelane_b32 v0, s15, 8 +; CHECK-NEXT: v_writelane_b32 v0, s16, 9 +; CHECK-NEXT: v_writelane_b32 v0, s17, 10 +; CHECK-NEXT: v_writelane_b32 v0, s18, 11 +; CHECK-NEXT: v_writelane_b32 v0, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v8, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v12 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v4, v14 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v2, v16 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] ; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19] -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v8, s4, 13 +; CHECK-NEXT: v_writelane_b32 v0, s4, 13 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v8, 13 -; CHECK-NEXT: v_readlane_b32 s8, v8, 5 -; CHECK-NEXT: v_readlane_b32 s9, v8, 6 -; CHECK-NEXT: v_readlane_b32 s10, v8, 7 
-; CHECK-NEXT: v_readlane_b32 s11, v8, 8 -; CHECK-NEXT: v_readlane_b32 s12, v8, 9 -; CHECK-NEXT: v_readlane_b32 s13, v8, 10 -; CHECK-NEXT: v_readlane_b32 s14, v8, 11 -; CHECK-NEXT: v_readlane_b32 s15, v8, 12 -; CHECK-NEXT: v_readlane_b32 s16, v8, 0 -; CHECK-NEXT: v_readlane_b32 s17, v8, 1 -; CHECK-NEXT: v_readlane_b32 s18, v8, 2 -; CHECK-NEXT: v_readlane_b32 s19, v8, 3 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v2, 13 +; CHECK-NEXT: v_readlane_b32 s8, v2, 5 +; CHECK-NEXT: v_readlane_b32 s9, v2, 6 +; CHECK-NEXT: v_readlane_b32 s10, v2, 7 +; CHECK-NEXT: v_readlane_b32 s11, v2, 8 +; CHECK-NEXT: v_readlane_b32 s12, v2, 9 +; CHECK-NEXT: v_readlane_b32 s13, v2, 10 +; CHECK-NEXT: v_readlane_b32 s14, v2, 11 +; CHECK-NEXT: v_readlane_b32 s15, v2, 12 +; CHECK-NEXT: v_readlane_b32 s16, v2, 0 +; CHECK-NEXT: v_readlane_b32 s17, v2, 1 +; CHECK-NEXT: v_readlane_b32 s18, v2, 2 +; CHECK-NEXT: v_readlane_b32 s19, v2, 3 ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_readlane_b32 s4, v8, 4 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v0, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; 
CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: ; kill: killed $vgpr4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -236,23 +236,21 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s16, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s16, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: 
v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -21,11 +21,10 @@ ; FIXEDABI-NEXT: s_mov_b32 s33, s32 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[18:19], -1 ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: s_mov_b64 exec, s[18:19] +; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 -; FIXEDABI-NEXT: v_writelane_b32 v41, s16, 0 ; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1 ; FIXEDABI-NEXT: s_getpc_b64 s[16:17] ; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4 @@ -33,10 +32,9 @@ ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 -; FIXEDABI-NEXT: v_readlane_b32 s4, v41, 0 +; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7] ; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 ; FIXEDABI-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll 
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -178,19 +178,19 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2c{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: no_stack_extern_call_many_args: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} -; GCN-NEXT: .vgpr_count: 0x2c{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2c{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_lds: ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x20{{$}} @@ -200,7 +200,7 @@ ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2a{{$}} +; GCN-NEXT: .vgpr_count: 0x29{{$}} ; GCN-NEXT: simple_stack: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x21{{$}} @@ -216,16 +216,16 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2c{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} -; GCN-NEXT: .stack_frame_size_in_bytes: 0x30{{$}} -; GCN-NEXT: .vgpr_count: 0x2c{{$}} +; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: ... 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -1371,7 +1371,7 @@ ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1395,14 +1395,14 @@ ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1425,14 +1425,14 @@ ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1455,14 +1455,14 @@ ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1484,14 +1484,14 @@ ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1515,7 +1515,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1528,7 +1528,7 @@ ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1556,14 +1556,14 @@ ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1590,14 +1590,14 @@ ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1619,14 +1619,14 @@ ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1648,14 +1648,14 @@ ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 
s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1679,7 +1679,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1692,7 +1692,7 @@ ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1721,14 +1721,14 @@ ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1756,14 +1756,14 @@ ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded 
Spill @@ -1789,14 +1789,14 @@ ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1824,14 +1824,14 @@ ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1860,7 +1860,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1873,7 +1873,7 @@ ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1909,14 +1909,14 @@ ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 
exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1951,14 +1951,14 @@ ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1991,14 +1991,14 @@ ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2026,14 +2026,14 @@ ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 
s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2063,7 +2063,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2076,7 +2076,7 @@ ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2128,14 +2128,14 @@ ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2186,14 +2186,14 @@ ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; 
%bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2240,14 +2240,14 @@ ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2283,14 +2283,14 @@ ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2328,7 +2328,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2341,7 +2341,7 @@ ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: 
s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2425,14 +2425,14 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s8 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s10, s33 +; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2515,14 +2515,14 @@ ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s10 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s33 +; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2597,14 +2597,14 @@ ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s8 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: 
buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2656,14 +2656,14 @@ ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2717,7 +2717,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -8,12 +8,12 @@ @alias = hidden alias void (), ptr @aliasee_default ; ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 41 -; GFX908-NEXT: .amdhsa_next_free_sgpr 33 +; GFX908: .amdhsa_next_free_vgpr 32 +; GFX908-NEXT: .amdhsa_next_free_sgpr 36 -; GFX90A: .amdhsa_next_free_vgpr 71 -; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A-NEXT: .amdhsa_accum_offset 44 +; GFX90A: .amdhsa_next_free_vgpr 65 +; GFX90A-NEXT: .amdhsa_next_free_sgpr 36 +; GFX90A-NEXT: .amdhsa_accum_offset 32 define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ 
b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel0: ; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK-NEXT: .amdhsa_next_free_sgpr 36 define amdgpu_kernel void @kernel0() #0 { bb: call void @alias0() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,8 +9,8 @@ ; The parent kernel has a higher VGPR usage than the possible callees. ; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 42 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK: .amdhsa_next_free_vgpr 41 +; CHECK-NEXT: .amdhsa_next_free_sgpr 36 define amdgpu_kernel void @kernel1() #0 { bb: call void asm sideeffect "; clobber v40 ", "~{v40}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel2: ; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK-NEXT: .amdhsa_next_free_sgpr 36 define amdgpu_kernel void @kernel2() #0 { bb: call void @alias2() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel3: ; CHECK: .amdhsa_next_free_vgpr 253 -; CHECK-NEXT: .amdhsa_next_free_sgpr 33 +; CHECK-NEXT: .amdhsa_next_free_sgpr 36 define amdgpu_kernel void @kernel3() #0 { bb: call void @alias3() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- 
a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -5821,7 +5821,6 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 11 @@ -5831,6 +5830,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 13 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: v_mov_b32_e32 v0, 14 +; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; VI-NEXT: v_mov_b32_e32 v0, 15 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -5866,7 +5866,6 @@ ; VI-NEXT: v_mov_b32_e32 v28, 9 ; VI-NEXT: v_mov_b32_e32 v29, 9 ; VI-NEXT: v_mov_b32_e32 v30, 10 -; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -5874,10 +5873,9 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -5891,7 +5889,6 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 11 @@ -5901,6 +5898,7 @@ ; CI-NEXT: v_mov_b32_e32 v0, 13 ; CI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:8 ; CI-NEXT: v_mov_b32_e32 v0, 14 +; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; CI-NEXT: v_mov_b32_e32 v0, 15 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -5936,7 +5934,6 @@ ; CI-NEXT: v_mov_b32_e32 v28, 9 ; CI-NEXT: v_mov_b32_e32 v29, 9 ; CI-NEXT: v_mov_b32_e32 v30, 10 -; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -5944,10 +5941,9 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -5961,7 +5957,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 11 @@ -5971,6 +5966,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6006,7 +6002,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 9 ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 -; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; 
GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -6014,10 +6009,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -6030,17 +6024,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12 ; GFX11-NEXT: v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14 ; GFX11-NEXT: v_mov_b32_e32 v4, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0 @@ -6067,11 +6059,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: 
s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6085,7 +6075,6 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 11 @@ -6095,6 +6084,7 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 13 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 14 +; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; HSA-NEXT: v_mov_b32_e32 v0, 15 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -6130,7 +6120,6 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 9 ; HSA-NEXT: v_mov_b32_e32 v29, 9 ; HSA-NEXT: v_mov_b32_e32 v30, 10 -; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -6138,10 +6127,9 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 @@ -6172,7 +6160,6 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6182,6 +6169,7 @@ ; VI-NEXT: v_mov_b32_e32 
v0, 0x41500000 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6217,7 +6205,6 @@ ; VI-NEXT: v_mov_b32_e32 v28, 0x41100000 ; VI-NEXT: v_mov_b32_e32 v29, 0x41100000 ; VI-NEXT: v_mov_b32_e32 v30, 0x41200000 -; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6225,10 +6212,9 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -6242,7 +6228,6 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6252,6 +6237,7 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; CI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6287,7 +6273,6 @@ ; CI-NEXT: v_mov_b32_e32 v28, 0x41100000 ; CI-NEXT: v_mov_b32_e32 v29, 0x41100000 ; CI-NEXT: v_mov_b32_e32 v30, 0x41200000 -; CI-NEXT: v_writelane_b32 
v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6295,10 +6280,9 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -6312,7 +6296,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6322,6 +6305,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6357,7 +6341,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 0x41100000 ; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000 -; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6365,10 +6348,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; 
GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -6381,19 +6363,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41500000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX11-NEXT: v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 @@ -6422,11 +6402,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6440,7 +6418,6 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded 
Spill -; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6450,6 +6427,7 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -6485,7 +6463,6 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 0x41100000 ; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000 ; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000 -; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6493,10 +6470,9 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 @@ -6527,7 +6503,6 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 7 @@ -6545,6 +6520,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 13 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; VI-NEXT: v_mov_b32_e32 v0, 14 +; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 
VI-NEXT: v_mov_b32_e32 v0, 15 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6580,7 +6556,6 @@ ; VI-NEXT: v_mov_b32_e32 v28, 5 ; VI-NEXT: v_mov_b32_e32 v29, 5 ; VI-NEXT: v_mov_b32_e32 v30, 6 -; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6588,10 +6563,9 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -6605,7 +6579,6 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 7 @@ -6623,6 +6596,7 @@ ; CI-NEXT: v_mov_b32_e32 v0, 13 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; CI-NEXT: v_mov_b32_e32 v0, 14 +; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; CI-NEXT: v_mov_b32_e32 v0, 15 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6658,7 +6632,6 @@ ; CI-NEXT: v_mov_b32_e32 v28, 5 ; CI-NEXT: v_mov_b32_e32 v29, 5 ; CI-NEXT: v_mov_b32_e32 v30, 6 -; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6666,10 +6639,9 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 
s4, v41, 0 +; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -6683,7 +6655,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 7 @@ -6701,6 +6672,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6736,7 +6708,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 5 ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 -; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6744,10 +6715,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -6760,17 +6730,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 
s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10 ; GFX11-NEXT: v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14 ; GFX11-NEXT: v_mov_b32_e32 v6, 13 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 @@ -6802,11 +6770,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6820,7 +6786,6 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 7 @@ -6838,6 +6803,7 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 13 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; HSA-NEXT: v_mov_b32_e32 v0, 14 +; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; HSA-NEXT: v_mov_b32_e32 v0, 15 ; HSA-NEXT: v_writelane_b32 
v40, s30, 0 @@ -6873,7 +6839,6 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 5 ; HSA-NEXT: v_mov_b32_e32 v29, 5 ; HSA-NEXT: v_mov_b32_e32 v30, 6 -; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6881,10 +6846,9 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 @@ -6911,7 +6875,6 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -6929,6 +6892,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6964,7 +6928,6 @@ ; VI-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000 -; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -6972,10 +6935,9 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; 
VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: v_readlane_b32 s4, v40, 2 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -6989,7 +6951,6 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -7007,6 +6968,7 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; CI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -7042,7 +7004,6 @@ ; CI-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; CI-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000 -; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -7050,10 +7011,9 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: v_readlane_b32 s4, v40, 2 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -7067,7 +7027,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; 
GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -7085,6 +7044,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -7120,7 +7080,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 -; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -7128,10 +7087,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -7144,10 +7102,9 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41000000 ; 
GFX11-NEXT: v_mov_b32_e32 v2, 0x41100000 @@ -7158,7 +7115,6 @@ ; GFX11-NEXT: v_mov_b32_e32 v5, 0x41400000 ; GFX11-NEXT: v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41600000 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 @@ -7189,11 +7145,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7207,7 +7161,6 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -7225,6 +7178,7 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -7260,7 +7214,6 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000 -; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ 
-7268,10 +7221,9 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: v_readlane_b32 s4, v40, 2 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -15,17 +15,17 @@ ; GCN-LABEL: {{^}}indirect_use_vcc: ; GCN: s_mov_b32 s4, s33 -; GCN: v_writelane_b32 v41, s4, 0 +; GCN: v_writelane_b32 v40, s4, 2 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s4, v41, 0 +; GCN: v_readlane_b32 s4, v40, 2 ; GCN: s_mov_b32 s33, s4 ; GCN: s_setpc_b64 s[30:31] ; GCN: ; NumSgprs: 36 -; GCN: ; NumVgprs: 42 +; GCN: ; NumVgprs: 41 define void @indirect_use_vcc() #1 { call void @use_vcc() ret void @@ -36,7 +36,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 42 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() ret void @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 38 ; VI: ; NumSgprs: 40 -; GCN: ; NumVgprs: 42 +; GCN: ; NumVgprs: 41 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -65,7 +65,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 42 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void 
@indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -80,7 +80,7 @@ } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 42 +; GCN: ; NumVgprs: 41 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 42 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -24,13 +24,11 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; MUBUF: buffer_store_dword -; MUBUF: buffer_store_dword -; FLATSCR: scratch_store_dword ; FLATSCR: scratch_store_dword +; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: v_writelane_b32 v40, s34, 2 -; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: v_writelane_b32 v40, s35, 3 ; GCN: s_swappc_b64 @@ -44,10 +42,8 @@ ; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 ; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 ; MUBUF: buffer_load_dword -; MUBUF: buffer_load_dword -; FLATSCR: scratch_load_dword ; FLATSCR: scratch_load_dword ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] @@ -62,21 +58,17 @@ ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; MUBUF: buffer_store_dword v40 -; MUBUF: buffer_store_dword v41 ; FLATSCR: scratch_store_dword off, v40 -; FLATSCR: scratch_store_dword off, v41 +; 
GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 -; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 ; MUBUF: buffer_load_dword v40 -; MUBUF: buffer_load_dword v41 ; FLATSCR: scratch_load_dword v40 -; FLATSCR: scratch_load_dword v41 ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -89,15 +89,13 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], 2 ; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} ; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, ; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} @@ -108,12 +106,10 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]] -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], 0 +; GCN-NEXT: 
v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: s_addk_i32 s32, 0xfc00{{$}} ; FLATSCR: s_add_i32 s32, s32, -16{{$}} @@ -140,13 +136,11 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill -; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill -; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: s_addk_i32 s32, 0x400 ; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 @@ -155,12 +149,10 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], [[FP_SPILL_LANE]] +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload -; 
MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload -; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, -16 @@ -631,14 +623,14 @@ ; Make sure that the FP save happens after restoring exec from the same ; register. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 s2, s33 +; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; FLATSCR: s_mov_b32 s33, s32 ; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 s[4:5], -1 -; FLATSCR: s_mov_b64 exec, s[4:5] -; FLATSCR: s_or_saveexec_b64 s[4:5], -1 +; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] +; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, s2 +; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -30,6 +30,13 @@ ; ; GCN_DBG-LABEL: test_loop: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s14, -1 +; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 +; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 @@ -39,11 +46,25 @@ ; 
GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec +; GCN_DBG-NEXT: s_mov_b64 exec, -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -65,8 +86,16 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm entry: %cmp = icmp eq i32 %n, -1 @@ -109,16 +138,36 @@ ; ; GCN_DBG-LABEL: loop_const_true: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s13, 
SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s14, -1 +; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 +; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -140,6 +189,9 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -174,16 +226,36 @@ ; ; GCN_DBG-LABEL: loop_const_false: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 
s14, -1 +; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 +; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -205,6 +277,9 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -240,16 +315,36 @@ ; ; GCN_DBG-LABEL: loop_const_undef: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s14, -1 +; GCN_DBG-NEXT: s_mov_b32 s15, 
0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 +; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -269,6 +364,9 @@ ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -318,6 +416,13 @@ ; ; GCN_DBG-LABEL: loop_arg_0: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s14, -1 +; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 +; GCN_DBG-NEXT: 
s_addc_u32 s13, s13, 0 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 @@ -335,11 +440,24 @@ ; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 @@ -362,6 +480,9 @@ ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -48,24 +48,37 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; 
GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: ; implicit-def: $vgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; 
GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 @@ -73,54 +86,69 @@ ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB0_3 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 -; 
GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: .LBB0_3: ; %Flow -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] 
; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: ds_write_b32 v1, v2 +; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -193,24 +221,37 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: ; implicit-def: $vgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB1_3 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: 
s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 @@ -218,76 +259,92 @@ ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword 
v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB1_4 ; GCN-O0-NEXT: .LBB1_3: ; %Flow -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_5 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end 
+; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4 -; GCN-O0-NEXT: v_readlane_b32 s3, v1, 5 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 -; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0 +; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end -; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 
offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: ds_write_b32 v1, v2 +; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -373,13 +430,19 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: ; implicit-def: $vgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -390,96 +453,130 @@ ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s4, 2 ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; 
GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] -; GCN-O0-NEXT: v_writelane_b32 v1, s2, 4 -; GCN-O0-NEXT: v_writelane_b32 v1, s3, 5 +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_2 ; GCN-O0-NEXT: s_branch .LBB2_4 ; GCN-O0-NEXT: .LBB2_2: ; %Flow -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: 
v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb.then +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 +; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 ; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: 
def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB2_5 ; GCN-O0-NEXT: .LBB2_4: ; %bb.else +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 -; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0 +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 +; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB2_2 ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 -; 
GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: ds_write_b32 v1, v2 +; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -579,14 +676,19 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: ; implicit-def: $vgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; 
implicit-def: $sgpr0 ; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_mov_b32 s1, s0 @@ -601,9 +703,9 @@ ; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] ; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 @@ -612,29 +714,43 @@ ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0 +; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] -; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1 +; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 ; GCN-O0-NEXT: s_branch .LBB3_4 ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt 
expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 ; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -647,17 +763,20 @@ ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 -; GCN-O0-NEXT: 
v_writelane_b32 v1, s1, 5 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -666,14 +785,19 @@ ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 ; GCN-O0-NEXT: s_branch .LBB3_7 ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 
offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: s_mov_b32 s2, s0 @@ -682,21 +806,23 @@ ; GCN-O0-NEXT: s_mov_b32 s5, s0 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 ; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -705,27 +831,43 @@ ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 
s[2:3], s[4:5] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:16 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 ; GCN-O0-NEXT: .LBB3_6: ; %Flow -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB3_1 ; GCN-O0-NEXT: .LBB3_7: ; %Flow1 -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v0, v2 +; GCN-O0-NEXT: ds_write_b32 
v1, v2 +; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -789,23 +931,35 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 +; GCN-O0-NEXT: ; implicit-def: $vgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 +; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 ; GCN-O0-NEXT: ; %bb.1: ; %bb.then ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: 
buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 @@ -814,20 +968,24 @@ ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v0, 31, v2 -; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0 +; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 ; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s4 +; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 -; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; GCN-O0-NEXT: .LBB4_2: ; %bb.end -; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_barrier +; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -919,96 +1077,127 @@ ; GCN-O0: ; %bb.0: ; %bb ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; 
GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: ; implicit-def: $vgpr1 +; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1 -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 3 +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s8, v1, 2 -; GCN-O0-NEXT: v_readlane_b32 s9, v1, 3 -; GCN-O0-NEXT: v_readlane_b32 s6, v1, 0 -; GCN-O0-NEXT: v_readlane_b32 s7, v1, 1 -; GCN-O0-NEXT: v_writelane_b32 v1, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v1, s7, 5 +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2 +; 
GCN-O0-NEXT: v_readlane_b32 s9, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0 +; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5 ; GCN-O0-NEXT: s_mov_b32 s4, 0x207 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4 +; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4 ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 7 -; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 6 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 7 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-O0-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 7 +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(1) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 -; GCN-O0-NEXT: 
v_writelane_b32 v1, s4, 8 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 9 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s6 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 9 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_mov_b32 s8, s4 ; GCN-O0-NEXT: s_mov_b32 s9, s4 ; GCN-O0-NEXT: s_mov_b32 s10, s4 ; GCN-O0-NEXT: s_mov_b32 s11, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s11 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 10 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 11 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 10 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 11 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb4 ; GCN-O0-NEXT: ; in 
Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 +; GCN-O0-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4 +; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v1, s4 ; GCN-O0-NEXT: s_mov_b32 s8, s4 ; GCN-O0-NEXT: s_mov_b32 s9, s4 ; GCN-O0-NEXT: s_mov_b32 s10, s4 ; GCN-O0-NEXT: s_mov_b32 s11, s4 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s11 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 12 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 13 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 12 +; 
GCN-O0-NEXT: v_writelane_b32 v0, s5, 13 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 @@ -1026,135 +1215,173 @@ ; GCN-O0-NEXT: s_mov_b32 s6, s9 ; GCN-O0-NEXT: s_mov_b32 s7, s8 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; 
GCN-O0-NEXT: v_readlane_b32 s4, v1, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: v_readlane_b32 s6, v1, 8 -; GCN-O0-NEXT: v_readlane_b32 s7, v1, 9 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(3) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: 
s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s6, v0, 8 +; GCN-O0-NEXT: v_readlane_b32 s7, v0, 9 ; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 17 +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 ; GCN-O0-NEXT: ; %bb.8: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 -; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s8, v1, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v1, 17 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s8, v4, 16 +; GCN-O0-NEXT: v_readlane_b32 s9, v4, 17 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-O0-NEXT: v_readlane_b32 s6, v1, 4 -; GCN-O0-NEXT: v_readlane_b32 s7, v1, 5 -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 14 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 15 +; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4 +; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15 ; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 ; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v1, s8, 0 -; GCN-O0-NEXT: v_writelane_b32 v1, s9, 1 -; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-O0-NEXT: v_writelane_b32 v4, s8, 0 +; GCN-O0-NEXT: v_writelane_b32 v4, s9, 1 +; GCN-O0-NEXT: v_writelane_b32 v4, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v4, s7, 3 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-O0-NEXT: v_writelane_b32 v1, s7, 19 +; GCN-O0-NEXT: v_writelane_b32 v4, s6, 18 +; GCN-O0-NEXT: v_writelane_b32 v4, s7, 19 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; 
GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 18 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 19 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: s_waitcnt expcnt(3) +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 18 +; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: ; %bb.11: ; %bb12 +; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte 
Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v4 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v5, v2 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 +; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GCN-O0-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GCN-O0-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -10,7 +10,7 @@ ; GCN-LABEL: {{^}}divergent_if_endif: -; VGPR: workitem_private_segment_byte_size = 12{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: @@ -19,7 +19,7 @@ ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}} ; Spill saved exec ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec @@ -82,13 +82,13 @@ } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 16{{$}} +; VGPR: workitem_private_segment_byte_size = 20{{$}} ; GCN: {{^}}; %bb.0: ; GCN-DAG: s_mov_b32 m0, -1 ; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}} ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -166,7 +166,7 @@ ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] +; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]] ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] @@ -175,6 +175,7 
@@ ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] +; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 @@ -187,6 +188,7 @@ ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow +; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET]] ; 4-byte Folded Reload ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -31,11 +31,10 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 @@ -43,10 +42,9 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 
s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -69,11 +67,10 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 @@ -81,10 +78,9 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -107,11 +103,10 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, 
func_v4f16@rel32@lo+4 @@ -119,10 +114,9 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -145,11 +139,10 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 @@ -158,10 +151,9 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; 
CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: v_writelane_b32 v40, s16, 16 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: v_writelane_b32 v40, s34, 2 @@ -48,7 +48,6 @@ ; CHECK-NEXT: v_writelane_b32 v40, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v42, s16, 0 ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v41, v31 ; CHECK-NEXT: s_mov_b32 s42, s15 @@ -92,10 +91,9 @@ ; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v42, 0 +; CHECK-NEXT: v_readlane_b32 s4, v40, 16 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir @@ -0,0 +1,279 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -start-before=si-lower-sgpr-spills -stop-after=virtregrewriter,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# Tests to check the conservative liveness extension for the wwm registers during SGPR spill lowering.
+ +# Even though the VGPR can be shared for the wwm-operand (writelane/readlane get inserted for the SGPR spills) +# and the regular operand (%0), they get different registers as we conservatively extend the liveness of the +# wwm-operands. +--- +name: test_single_block +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr4, $vgpr2_vgpr3 + ; GCN-LABEL: name: test_single_block + ; GCN: liveins: $sgpr4, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: SI_RETURN + SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec + GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec + SI_RETURN +... + +# Due to the presence of wwm-operand in the divergent flow, the regular variable (%0) shouldn't get the same register +# allocated for the wwm-operand in writelane/readlane when the SGPR spill is lowered. 
+ +--- +name: test_if_else +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: test_if_else + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr6, 0, killed $vgpr0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: SI_RETURN + bb.0: + liveins: $sgpr6, $sgpr10_sgpr11 + S_BRANCH %bb.1 + bb.1: + liveins: $sgpr6, $sgpr10_sgpr11 + %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec + S_CBRANCH_EXECZ %bb.3, implicit $exec + bb.2: + liveins: $sgpr6, $sgpr10_sgpr11 + SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr6 = SI_SPILL_S32_RESTORE 
%stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10_sgpr11 + $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec + S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 + SI_RETURN +... + +# The wwm-register usage outside the loop should have the interference marked with +# all the regular virtual registers used in the test. The divergent loop index value (%1) +# can actually share the same VGPR as the wwm-operand. But since we extend the liveness of +# the wwm operand, an interference will always exist between them. + +--- +name: test_loop +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: test_loop + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr1, $sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0 + ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 
killed $vgpr1, implicit $exec + ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4 + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 5, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec + ; GCN-NEXT: $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.5, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr6_sgpr7 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr1 = V_SUB_U32_e32 1, killed $vgpr1, implicit $exec + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: + ; GCN-NEXT: liveins: $vgpr0, $sgpr6_sgpr7 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc + ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: SI_RETURN + bb.0: + liveins: $sgpr4, $sgpr10_sgpr11 + %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + bb.1: + liveins: $sgpr4, $sgpr10_sgpr11 + SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec + S_BRANCH %bb.2 + bb.2: + liveins: $sgpr4, $sgpr10_sgpr11 + S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0 + $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec + S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4 + %1:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + S_CBRANCH_EXECZ %bb.3, implicit $exec + S_BRANCH %bb.3 + bb.3: 
+ $vcc = V_CMP_EQ_U32_e64 0, %1:vgpr_32, implicit $exec + $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + S_CBRANCH_SCC1 %bb.5, implicit $scc + bb.4: + liveins: $sgpr6_sgpr7 + %2:vgpr_32 = V_SUB_U32_e32 1, %1:vgpr_32, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 %2:vgpr_32, implicit $exec + S_BRANCH %bb.3 + bb.5: + liveins: $sgpr6_sgpr7 + $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc + SI_RETURN +... + +# There must be one KILL instruction for the wwm-operand in every return block. +# Due to that, the wwm-register allocated should be different from the ones +# allocated for the regular virtual registers. + +--- +name: test_multiple_return_blocks +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: test_multiple_return_blocks + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr4, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0 + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: SI_RETURN + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $vgpr0, $vgpr2_vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec + ; GCN-NEXT: 
GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: SI_RETURN + bb.0: + liveins: $sgpr4, $vgpr2_vgpr3 + S_CBRANCH_EXECZ %bb.2, implicit $exec + bb.1: + liveins: $sgpr4, $vgpr2_vgpr3 + SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec + GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec + SI_RETURN + bb.2: + liveins: $vgpr2_vgpr3 + %1:vgpr_32 = V_MOV_B32_e32 20, implicit $exec + GLOBAL_STORE_DWORD $vgpr2_vgpr3, %1:vgpr_32, 0, 0, implicit $exec + SI_RETURN +... diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -17,10 +17,9 @@ ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: s_addk_i32 s32, 0x3000 -; GCN-NEXT: v_writelane_b32 v43, s16, 0 +; GCN-NEXT: v_writelane_b32 v42, s16, 2 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 @@ -56,10 +55,9 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: v_readlane_b32 s30, v42, 0 -; GCN-NEXT: v_readlane_b32 s4, v43, 0 +; GCN-NEXT: v_readlane_b32 s4, v42, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword 
v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xd000 ; GCN-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -117,21 +117,15 @@ ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, exec_lo -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5 +; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 4 ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill +; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v0, off, s5 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s4 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -228,44 +222,31 @@ ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 -; 
FLAT_SCR_OPT-NEXT: scratch_store_dword off, v1, s3 -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload +; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 -; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 +; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1 +; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1] ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, exec_lo -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5 +; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 4 ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill +; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v0, off, s5 -; 
FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s4 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART @@ -362,24 +343,17 @@ ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v1, s3 -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload +; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3 -; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2 -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 -; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 +; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1 +; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1] ; FLAT_SCR_ARCH-NEXT: s_endpgm call void asm sideeffect "", "~{s[0:7]}" () call void asm sideeffect "", "~{s[8:15]}" () diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir 
@@ -12,14 +12,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_lo @@ -38,14 +37,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_hi @@ -64,17 +62,16 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 
killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec @@ -96,13 +93,12 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo @@ -120,13 +116,12 @@ body: | bb.0: ; CHECK-LABEL: name: 
reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi @@ -144,16 +139,15 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 
$vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,14 +13,13 @@ bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -44,13 +43,12 @@ bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; 
CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -14,12 +14,11 @@ ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[8:9], -1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s4, 2 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s4, 0 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) @@ -29,10 +28,9 @@ ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v41, 0 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ 
b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -8,7 +8,7 @@ ; SDAG-LABEL: gfx_func: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s38, s33 +; SDAG-NEXT: s_mov_b32 s36, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -81,14 +81,14 @@ ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: s_mov_b32 s33, s38 +; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: gfx_func: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s38, s33 +; GISEL-NEXT: s_mov_b32 s36, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -161,7 +161,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s38 +; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void @extern_c_func() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -101,12 +101,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, 
s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 @@ -115,10 +114,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -132,26 +130,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -165,27 +160,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -199,26 +191,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte 
Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -237,13 +226,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: 
s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 @@ -253,10 +241,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -270,28 +257,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -305,29 +289,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -341,28 +321,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -382,13 +359,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 @@ -398,10 +374,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -415,28 +390,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, 
off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -450,29 +422,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -486,28 +454,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded 
Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -527,12 +492,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, 
s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 @@ -540,10 +504,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -557,25 +520,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; 
GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -589,27 +549,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -623,25 +579,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 
16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -660,13 +613,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 @@ -674,10 +626,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -691,26 +642,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -724,28 +672,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, 
v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -759,26 +703,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: 
s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -798,13 +739,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 @@ -812,10 +752,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; 
GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -829,26 +768,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -862,28 +798,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, 
s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -897,26 +829,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 
s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -936,12 +865,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 @@ -949,10 +877,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -966,25 +893,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -998,27 +922,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 
v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1032,25 +952,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: 
scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1069,13 +986,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 @@ -1083,10 +999,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1100,26 +1015,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: 
v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1133,28 +1045,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; 
GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1168,26 +1076,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1207,13 +1112,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 @@ -1221,10 +1125,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1238,26 +1141,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-NEXT: 
s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1271,28 +1171,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; 
GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1306,26 +1202,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1345,12 +1238,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 @@ -1358,10 +1250,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1375,25 +1266,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; 
GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1407,27 +1295,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1441,25 +1325,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1478,13 +1359,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 @@ -1492,10 +1372,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1509,26 +1388,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 
; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1542,27 +1418,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1576,26 +1448,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1614,14 +1483,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; 
GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1629,10 +1497,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1646,27 +1513,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1680,29 +1544,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1716,27 +1576,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1756,15 +1613,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, 
external_void_func_v2i64@rel32@lo+4 @@ -1772,10 +1628,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1789,28 +1644,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1824,28 +1676,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1859,28 +1707,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1899,16 +1744,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, 
s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 @@ -1916,10 +1760,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1933,29 +1776,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 
s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1969,29 +1809,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ 
-2005,29 +1841,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2049,18 +1882,17 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: v_mov_b32_e32 v6, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 4 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 @@ -2068,10 +1900,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2085,31 +1916,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 
v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2123,30 +1951,26 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 
s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2160,31 +1984,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2205,12 +2026,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 @@ -2218,10 +2038,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2235,25 +2054,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2267,27 +2083,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2301,25 +2113,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, -16 @@ -2338,12 +2147,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 @@ -2351,10 +2159,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2368,25 +2175,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2400,27 +2204,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 
@@ -2434,25 +2234,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2471,13 +2268,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 @@ -2485,10 +2281,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2502,26 +2297,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 
-; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2535,27 +2327,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2569,26 +2357,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2607,14 +2392,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 -; 
GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 @@ -2622,10 +2406,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2639,27 +2422,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2673,28 +2453,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2708,27 +2484,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded 
Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2747,8 +2520,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2756,7 +2529,6 @@ 
; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 @@ -2764,10 +2536,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2781,29 +2552,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; 
GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2817,29 +2585,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: 
s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2853,29 +2617,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2894,13 +2655,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, 
s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 @@ -2908,10 +2668,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2925,26 +2684,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], 
s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2958,27 +2714,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: 
s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2992,26 +2744,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3030,15 +2779,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 
; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 @@ -3046,10 +2794,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3063,28 +2810,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, 
s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3098,28 +2842,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: 
scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3133,28 +2873,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3173,8 +2910,8 @@ ; 
GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3183,7 +2920,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4 @@ -3191,10 +2927,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3208,30 +2943,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-NEXT: 
s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3245,29 +2977,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3281,30 +3009,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3323,12 +3048,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 @@ -3336,10 +3060,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3353,25 +3076,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; 
GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3385,27 +3105,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3419,25 +3135,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3457,12 +3170,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3470,10 +3182,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3487,25 +3198,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, 
external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3519,27 +3227,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 
v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3553,25 +3257,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3591,12 +3292,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3604,10 +3304,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3621,25 +3320,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 
s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3653,27 +3349,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3687,25 +3379,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3725,13 +3414,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: 
s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3739,10 +3427,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3756,26 +3443,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3789,27 +3473,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3823,26 +3503,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3861,13 +3538,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3875,10 
+3551,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3892,26 +3567,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 
s32, 0xfe00 @@ -3925,28 +3597,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3960,26 +3628,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3998,12 +3663,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -4011,10 +3675,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4028,25 +3691,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4060,27 +3720,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4094,25 +3750,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 
s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4132,13 +3785,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -4146,10 +3798,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte 
Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4163,26 +3814,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4196,28 +3844,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, 
s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4231,26 +3875,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; 
GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4269,12 +3910,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 @@ -4282,10 +3922,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 
0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4299,25 +3938,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4331,27 +3967,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: 
v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4365,25 +3997,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4403,12 +4032,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4416,10 +4044,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4433,25 +4060,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4465,27 +4089,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4499,25 +4119,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4537,13 +4154,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4551,10 +4167,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4568,26 +4183,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, 
external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4601,27 +4213,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4635,26 +4243,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ 
-4673,14 +4278,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 @@ -4688,10 +4292,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4705,27 +4308,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, 
external_void_func_v3i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4739,28 +4339,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 
s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4774,27 +4370,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, -16 @@ -4813,15 +4406,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 6 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 @@ -4829,10 +4421,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4846,28 +4437,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; 
GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4881,28 +4469,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; 
GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4916,28 +4500,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 
offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4956,12 +4537,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -4969,10 +4549,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4986,25 +4565,22 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: 
s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5018,27 +4594,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5052,25 +4624,22 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5090,15 +4659,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, 
off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -5106,10 +4674,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5123,28 +4690,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, 
s35, external_void_func_v4i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5158,28 +4722,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5193,28 +4753,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: 
s_add_i32 s32, s32, -16 @@ -5233,8 +4790,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5242,7 +4799,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: v_mov_b32_e32 v4, 5 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 @@ -5250,10 +4806,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5267,29 +4822,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; 
GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5303,29 +4855,25 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5339,29 +4887,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; 
GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5380,9 +4925,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5397,10 +4941,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5414,30 +4957,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5451,32 +4991,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v8i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5490,30 +5026,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5534,8 +5067,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5546,7 +5079,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, 6 ; GFX9-NEXT: v_mov_b32_e32 v6, 7 ; GFX9-NEXT: v_mov_b32_e32 v7, 8 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5554,10 +5086,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ 
-5571,20 +5102,19 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5592,11 +5122,9 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5610,17 +5138,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: 
v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 @@ -5629,11 +5155,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5647,20 +5171,19 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 
16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 @@ -5668,11 +5191,9 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5691,9 +5212,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5710,10 +5230,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5727,13 +5246,11 @@ ; GFX10-NEXT: 
s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 @@ -5741,18 +5258,17 @@ ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5766,14 +5282,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; 
GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -5781,19 +5294,18 @@ ; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16 ; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32 ; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5807,13 +5319,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 @@ -5821,18 +5331,17 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5853,9 +5362,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_load_dwordx2 
s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5877,10 +5385,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5894,13 +5401,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -5912,18 +5417,17 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; 
GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5937,14 +5441,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 @@ -5956,19 +5457,18 @@ ; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: 
scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5982,13 +5482,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 @@ -6000,18 +5498,17 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; 
GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -6032,9 +5529,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off @@ -6059,10 +5555,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6076,13 +5571,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) @@ -6095,20 +5588,19 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -6122,14 +5614,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: global_load_b32 v32, v[0:1], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -6142,20 +5631,19 @@ ; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] 
offset:80 ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6169,13 +5657,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -6188,20 +5674,19 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, 
s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -6223,15 +5708,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_writelane_b32 v43, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -6244,12 +5728,11 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6261,21 +5744,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v43, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 @@ -6284,14 +5766,12 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v43, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6302,23 +5782,20 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v43, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 
s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6327,13 +5804,11 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v43, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6345,21 +5820,20 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 
; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6368,14 +5842,12 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v43, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 offset:12 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6392,9 +5864,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -6409,10 +5880,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6426,30 +5896,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] ; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -6463,32 +5930,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] ; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6502,30 +5965,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -6546,16 +6006,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; 
GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 @@ -6563,12 +6022,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6580,32 +6038,29 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], 
s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6616,33 +6071,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: 
scratch_store_b32 off, v1, s33 offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6654,32 +6105,29 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6700,18 +6148,17 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_add_u32_e32 
v0, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 @@ -6721,7 +6168,7 @@ ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6729,7 +6176,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6743,39 +6189,36 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; 
GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -6789,13 +6232,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; 
GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -6812,16 +6253,14 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6835,13 +6274,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:20 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -6858,16 +6296,14 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, 
s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:20 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 @@ -6899,9 +6335,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -6934,10 +6369,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6951,14 +6385,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 
; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] @@ -6988,11 +6421,9 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7006,15 +6437,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 
v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] @@ -7041,11 +6470,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7059,14 +6486,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] @@ -7096,11 +6522,9 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, 
off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7117,7 +6541,7 @@ ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s33 +; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7202,14 +6626,14 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: tail_call_byval_align16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s7, s33 +; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7297,14 +6721,14 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: s_mov_b32 s33, s7 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: tail_call_byval_align16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s5, s33 +; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7387,14 +6811,14 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, 
s0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: s_mov_b32 s33, s5 +; GFX11-NEXT: s_mov_b32 s33, s4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SCRATCH-LABEL: tail_call_byval_align16: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7479,7 +6903,7 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s5 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -7497,12 +6921,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 @@ -7511,10 +6934,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: 
s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7528,26 +6950,23 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7561,27 +6980,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7595,26 +7011,23 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 
v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7633,13 +7046,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 @@ -7648,10 +7060,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 
@@ -7665,27 +7076,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7699,17 +7107,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; 
GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7717,11 +7123,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7735,27 +7139,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7774,13 +7175,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 @@ -7789,10 +7189,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7806,27 
+7205,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7840,17 +7236,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: 
s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7858,11 +7252,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7876,27 +7268,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7915,13 +7304,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 42 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 @@ -7930,10 +7318,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7947,27 +7334,24 @@ ; GFX10-NEXT: 
s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 42 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7981,17 +7365,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 42 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: 
v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 42 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7999,11 +7381,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8017,27 +7397,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: 
s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8056,15 +7433,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 @@ -8074,10 +7450,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8091,16 +7466,15 
@@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -8110,11 +7484,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8128,17 +7500,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: 
s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -8149,11 +7519,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8167,16 +7535,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -8186,11 +7553,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8209,11 +7574,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -8231,10 +7595,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8248,13 +7611,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 
4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8271,11 +7633,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8288,15 +7648,13 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8314,11 +7672,9 @@ ; 
GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8332,13 +7688,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8355,11 +7710,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8379,8 +7732,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -8391,7 +7744,6 @@ ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8403,10 +7755,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8420,16 +7771,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: 
s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -8445,11 +7795,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8463,17 +7811,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -8490,11 +7836,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, 
v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8508,16 +7852,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -8533,11 +7876,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: 
s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8556,11 +7897,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -8584,10 +7924,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8601,13 +7940,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 8 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8630,11 +7968,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8648,14 +7984,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 8 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8679,11 +8013,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8697,13 +8029,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8726,11 +8057,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8752,12 +8081,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 @@ -8786,10 +8114,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; 
GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8803,13 +8130,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8838,11 +8164,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8856,14 +8180,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8893,11 +8215,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8911,13 +8231,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8946,11 +8265,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, 
v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8971,13 +8288,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 @@ -8986,10 +8302,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9003,27 +8318,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded 
Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9037,17 +8349,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, 
external_void_func_f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9055,11 +8365,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9073,27 +8381,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9112,13 +8417,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 4.0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 @@ -9127,10 +8431,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9144,27 +8447,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9178,17 +8478,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, 
external_void_func_f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -9196,11 +8494,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9214,27 +8510,24 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 
s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9253,15 +8546,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9271,10 +8563,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9288,16 +8579,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; 
GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -9307,11 +8597,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9325,17 +8613,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v2f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -9346,11 +8632,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9364,16 +8648,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -9383,11 +8666,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9406,8 +8687,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9416,7 +8697,6 @@ ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9427,10 +8707,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9444,16 +8723,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: 
s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -9466,11 +8744,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9484,17 +8760,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, 
s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -9508,11 +8782,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9526,16 +8798,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -9548,11 
+8819,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9571,8 +8840,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9585,7 +8854,6 @@ ; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_mov_b32 s7, -1.0 ; GFX9-NEXT: s_mov_b32 s8, 0.5 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 @@ -9598,10 +8866,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9615,16 +8882,15 @@ ; 
GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 7 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -9643,11 +8909,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9661,17 +8925,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 7 ; GFX11-NEXT: s_add_i32 s32, 
s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -9691,11 +8953,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9709,16 +8969,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -9737,11 +8996,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9760,15 +9017,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 @@ -9778,10 +9034,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; 
GFX9-NEXT: s_mov_b32 s33, s34 @@ -9795,16 +9050,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -9814,11 +9068,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9832,17 +9084,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: 
v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 0x40100000 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -9853,11 +9103,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9871,16 +9119,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -9890,11 +9137,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9913,8 +9158,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9925,7 +9170,6 @@ ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 @@ -9937,10 +9181,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: 
s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9954,16 +9197,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -9979,11 +9221,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9997,17 +9237,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 
v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -10024,11 +9262,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10042,16 +9278,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 
0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -10067,11 +9302,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10090,8 +9323,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -10106,7 +9339,6 @@ ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_mov_b32 s9, 0x40200000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10120,10 +9352,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10137,16 +9368,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 8 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -10168,11 +9398,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10186,17 +9414,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: 
s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 8 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -10219,11 +9445,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 -; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s0, v40, 8 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10237,16 +9461,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, 
external_void_func_v3f64_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -10268,11 +9491,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10291,13 +9512,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10306,10 +9526,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10323,13 +9542,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12 @@ -10339,11 +9557,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10357,14 +9573,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_add_i32 
s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 @@ -10375,11 +9589,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10393,13 +9605,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 @@ -10409,11 +9620,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10433,14 +9642,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -10450,10 +9658,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10467,12 +9674,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10485,11 +9691,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10503,13 +9707,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10523,11 +9725,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 
s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10541,12 +9741,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10559,11 +9758,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10583,14 +9780,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 
; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -10600,10 +9796,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10617,12 +9812,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10635,11 +9829,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 
0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10653,13 +9845,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10673,11 +9863,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10691,12 +9879,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10709,11 +9896,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10733,15 +9918,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -10751,10 +9935,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, 
v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10768,16 +9951,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -10787,11 +9969,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10805,17 +9985,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 3 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -10826,11 +10004,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10844,16 +10020,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: 
s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -10863,11 +10038,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10886,15 +10059,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -10904,10 +10076,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10921,16 +10092,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -10940,11 +10110,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10958,17 +10126,15 @@ ; GFX11-NEXT: 
s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_movk_i32 s5, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -10979,11 +10145,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10997,16 +10161,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 
4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -11016,11 +10179,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11039,14 +10200,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11056,10 +10216,9 @@ 
; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11073,12 +10232,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -11091,11 +10249,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11109,13 +10265,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; 
GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -11129,11 +10283,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11147,12 +10299,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -11165,11 +10316,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11189,15 +10338,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11207,10 +10355,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11224,16 +10371,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; 
GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -11243,11 +10389,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11261,17 +10405,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v4i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 0x40003 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -11282,11 +10424,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11300,16 +10440,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -11319,11 +10458,9 @@ ; GFX10-SCRATCH-NEXT: 
v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11342,13 +10479,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 @@ -11357,10 +10493,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11374,13 +10509,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12 @@ -11390,11 +10524,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11408,14 +10540,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX11-NEXT: 
s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 @@ -11426,11 +10556,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11444,13 +10572,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 @@ -11460,11 +10587,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: 
scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11484,14 +10609,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -11501,10 +10625,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11518,12 +10641,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: 
v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -11536,11 +10658,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11554,13 +10674,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -11574,11 +10692,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; 
GFX11-NEXT: s_mov_b32 s33, s0 @@ -11592,12 +10708,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -11610,11 +10725,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11634,15 +10747,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: 
s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -11652,10 +10764,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11669,16 +10780,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -11688,11 +10798,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11706,17 +10814,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -11727,11 +10833,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11745,16 +10849,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -11764,11 +10867,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11787,8 +10888,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ 
-11797,7 +10898,6 @@ ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 @@ -11808,10 +10908,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11825,16 +10924,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -11847,11 +10945,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: 
s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11865,17 +10961,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s0, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -11889,11 +10983,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11907,16 +10999,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: 
scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -11929,11 +11020,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11952,8 +11041,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 
v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11964,7 +11053,6 @@ ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_mov_b32 s7, 6 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -11976,10 +11064,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11993,16 +11080,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12018,11 +11104,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; 
GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12036,17 +11120,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12063,11 +11145,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12081,16 +11161,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12106,11 +11185,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12129,8 +11206,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, 
s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12138,7 +11215,6 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12150,10 +11226,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12167,12 +11242,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12189,11 +11263,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12207,13 +11279,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12231,11 +11301,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12249,12 +11317,11 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12271,11 +11338,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12295,8 +11360,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12307,7 +11372,6 @@ ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12319,10 +11383,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: 
v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12336,16 +11399,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s34, 6 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12361,11 +11423,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12379,17 +11439,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: 
s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12406,11 +11464,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12424,16 +11480,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; 
GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12449,11 +11504,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12472,8 +11525,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12486,7 +11539,6 @@ ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_mov_b32 s8, 5 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 @@ -12499,10 +11551,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: 
v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12516,16 +11567,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s34, 7 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12544,11 +11594,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12562,17 +11610,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: 
s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 7 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12592,11 +11638,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12610,16 +11654,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; 
GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12638,11 +11681,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12661,11 +11702,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -12692,10 +11732,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12709,13 +11748,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12741,11 +11779,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12759,14 +11795,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; 
GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12793,11 +11827,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12811,13 +11843,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12843,11 +11874,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; 
GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12868,8 +11897,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12888,7 +11917,6 @@ ; GFX9-NEXT: s_mov_b32 s9, 6 ; GFX9-NEXT: s_mov_b32 s10, 7 ; GFX9-NEXT: s_mov_b32 s11, 8 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 @@ -12904,10 +11932,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12921,16 +11948,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s34, 10 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12958,11 +11984,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 10 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12976,17 +12000,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 10 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX11-NEXT: 
v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -13015,11 +12037,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 10 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -13033,16 +12053,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -13070,11 +12089,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -13093,8 +12110,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 18 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13105,7 +12122,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 @@ -13140,10 +12156,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 18 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -13157,13 +12172,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 18 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -13205,11 +12219,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 18 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -13223,14 +12235,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 18 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -13273,11 +12283,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 18 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -13291,13 +12299,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -13339,11 +12346,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -13364,8 +12369,8 @@ ; GFX9-NEXT: 
s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13381,7 +12386,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 @@ -13455,10 +12459,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 28 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -13472,13 +12475,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 28 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: 
v_writelane_b32 v40, s7, 3 @@ -13565,11 +12567,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 28 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -13583,16 +12583,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -13673,11 +12671,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 28 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; 
GFX11-NEXT: s_mov_b32 s33, s0 @@ -13691,14 +12687,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -13781,11 +12776,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -13806,8 +12799,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, 
s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13822,7 +12815,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 @@ -13902,10 +12894,9 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 28 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -13919,13 +12910,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 28 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -14017,11 +13007,9 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 28 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -14035,14 +13023,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 28 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -14128,11 +13114,9 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 28 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -14146,13 +13130,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -14242,11 +13225,9 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -14268,13 +13249,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 @@ -14286,12 +13266,11 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: 
v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14303,15 +13282,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 @@ -14319,18 +13296,17 @@ ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword 
v41, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14341,30 +13317,26 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
s_setpc_b64 s[30:31] @@ -14376,30 +13348,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14416,7 +13385,6 @@ ; GFX9-NEXT: 
s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 12 @@ -14424,6 +13392,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -14460,7 +13429,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_mov_b32_e32 v31, 11 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4 @@ -14468,10 +13436,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -14485,9 +13452,9 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: 
v_mov_b32_e32 v2, 14 @@ -14530,7 +13497,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-NEXT: v_mov_b32_e32 v31, 11 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4 @@ -14538,11 +13504,9 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -14556,10 +13520,9 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -14581,7 +13544,6 @@ ; GFX11-NEXT: v_dual_mov_b32 v26, 8 :: v_dual_mov_b32 v27, 9 ; GFX11-NEXT: v_dual_mov_b32 v28, 9 :: v_dual_mov_b32 v29, 9 ; GFX11-NEXT: v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 @@ -14590,11 +13552,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; 
GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -14608,9 +13568,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 @@ -14650,7 +13610,6 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 @@ -14658,11 +13617,9 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -14694,7 +13651,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 @@ -14710,6 +13666,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -14746,7 +13703,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_mov_b32_e32 v31, 7 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4 @@ -14754,10 +13710,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -14771,20 +13726,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v3, 14 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 11 ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 +; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -14824,7 +13779,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-NEXT: v_mov_b32_e32 v31, 7 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4 @@ -14832,11 +13786,9 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -14850,18 +13802,16 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: 
s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9 ; GFX11-NEXT: v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11 ; GFX11-NEXT: v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13 ; GFX11-NEXT: v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 @@ -14888,11 +13838,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -14906,9 +13854,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10 @@ -14918,9 +13866,8 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 15 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; 
GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -14962,11 +13909,9 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -14994,7 +13939,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 @@ -15010,6 +13954,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -15046,7 +13991,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; 
GFX9-NEXT: s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4 @@ -15054,10 +13998,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -15071,20 +14014,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -15124,7 +14067,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: 
s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4 @@ -15132,11 +14074,9 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -15150,10 +14090,9 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41200000 @@ -15163,9 +14102,8 @@ ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41600000 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41700000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 @@ -15194,11 +14132,9 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; 
GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -15212,9 +14148,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000 @@ -15224,9 +14160,8 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41600000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41700000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -15268,11 +14203,9 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 
s32, s32, -16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -13,13 +13,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -32,10 +31,9 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -49,12 +47,11 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; 
GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -69,11 +66,9 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -87,13 +82,11 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_getpc_b64 s[4:5] ; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -109,11 +102,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: 
s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -183,9 +174,9 @@ ; GFX11-LABEL: void_func_void_clobber_s28_s29: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v0, s28, 0 ; GFX11-NEXT: v_writelane_b32 v0, s29, 1 ; GFX11-NEXT: v_writelane_b32 v0, s30, 2 @@ -201,9 +192,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 ; GFX11-NEXT: v_readlane_b32 s29, v0, 1 ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 @@ -219,12 +210,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s31 @@ -241,10 +231,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -258,15 +247,14 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: ;;#ASMSTART @@ -281,11 +269,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -299,16 +285,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: 
v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: ;;#ASMSTART @@ -323,11 +307,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -347,11 +329,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -369,10 +350,9 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -386,21 +366,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v41, v31 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART @@ -409,11 +388,9 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -427,23 +404,20 @@ ; GFX11-NEXT: 
s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v41, v31 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v31, v41 ; GFX11-NEXT: ;;#ASMSTART @@ -452,11 +426,9 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -477,12 +449,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s33 @@ -499,10 +470,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -516,20 +486,19 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s33 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 @@ -539,11 +508,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 
s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -557,21 +524,19 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s33 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s33, s4 @@ -582,11 +547,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 
4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -606,10 +569,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -628,10 +590,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -645,20 +606,19 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: 
v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s34 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_mov_b32 s34, s4 @@ -668,11 +628,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -686,21 +644,19 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s34 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s34 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s34, s4 @@ -711,11 +667,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -735,11 +689,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v41, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 -; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -755,10 +708,9 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 -; GFX9-NEXT: v_readlane_b32 s34, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -772,20 +724,19 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: 
v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 @@ -793,11 +744,9 @@ ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -811,22 +760,19 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; 
GFX11-NEXT: ; def v40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v40 @@ -834,11 +780,9 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 -; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -854,54 +798,54 @@ ; GFX9-LABEL: void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s33, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: void_func_void_clobber_s33: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s33, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_void_clobber_s33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v0, s33, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s33, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 @@ -912,54 +856,54 @@ ; GFX9-LABEL: void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s34, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: void_func_void_clobber_s34: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s34, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_void_clobber_s34: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: v_writelane_b32 v0, s34, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s34, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 +; 
GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s34}"() #0 @@ -974,11 +918,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 @@ -986,10 +929,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1003,24 +945,21 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: 
s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1034,26 +973,22 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s33@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, 
off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1071,11 +1006,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 @@ -1083,10 +1017,9 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1100,24 +1033,21 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, 
void_func_void_clobber_s34@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1131,26 +1061,22 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s34@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; 
GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1168,12 +1094,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 @@ -1189,10 +1114,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1206,20 +1130,19 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: 
v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1228,11 +1151,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1246,21 +1167,19 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, s40 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1270,11 +1189,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 
s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1294,12 +1211,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 -; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART @@ -1324,10 +1240,9 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1341,25 +1256,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: 
s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s34, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_mov_b32_e32 v41, v32 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1372,11 +1286,9 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1390,26 +1302,24 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: 
v_writelane_b32 v40, s0, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_mov_b32_e32 v41, v32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1422,11 +1332,9 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -79,7 +79,7 @@ ; GFX11-LABEL: call_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ 
-100,7 +100,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -184,7 +184,7 @@ ; GFX11-LABEL: call_i16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -205,7 +205,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -289,7 +289,7 @@ ; GFX11-LABEL: call_2xi16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -310,7 +310,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -402,7 +402,7 @@ ; GFX11-LABEL: call_3xi16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill @@ -423,7 +423,7 @@ ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -890,7 +890,7 @@ ; GFX11-LABEL: call_100xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v100, s33 offset:128 ; 4-byte Folded Spill @@ -976,7 +976,7 @@ ; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xff70 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -2835,7 +2835,7 @@ ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -3001,28 +3001,28 @@ ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:604 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:608 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, 
s[0:3], s33 offset:612 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:616 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:620 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:624 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:628 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 @@ -3072,14 +3072,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, v28 ; GFX9-NEXT: v_mov_b32_e32 v7, v29 ; GFX9-NEXT: v_mov_b32_e32 v8, v30 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload 
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 @@ -3103,7 +3103,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v33, 1 ; GFX9-NEXT: v_readlane_b32 s30, v33, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 @@ -3117,7 +3117,7 @@ ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 
; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 @@ -3262,28 +3262,28 @@ ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; 
GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX10-NEXT: s_clause 0x15 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 @@ -3350,14 +3350,14 @@ ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1548 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1552 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1556 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1560 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1564 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 +; GFX10-NEXT: buffer_load_dword 
v9, off, s[0:3], s33 offset:1568 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 @@ -3381,7 +3381,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 @@ -3397,7 +3397,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1600 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3495,7 +3495,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1584 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:528 ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 @@ -3511,13 +3511,13 @@ ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v15, v26 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte 
Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 ; GFX11-NEXT: v_mov_b32_e32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 @@ -3560,13 +3560,13 @@ ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; 16-byte Folded Reload +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 -; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 -; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 +; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 +; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 +; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 ; GFX11-NEXT: s_add_i32 s0, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -3590,7 +3590,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1600 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s45 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- 
a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -397,9 +397,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v41, s16, 0 +; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -466,10 +465,9 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -483,9 +481,8 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v41, s16, 0 +; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -552,10 +549,9 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -573,9 +569,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v41, s16, 0 +; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -645,10 +640,9 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -662,9 +656,8 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v41, s16, 0 +; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -732,10 +725,9 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -753,9 +745,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v41, s16, 0 +; GCN-NEXT: v_writelane_b32 v40, s16, 18 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -824,10 +815,9 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -841,9 +831,8 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v41, s16, 0 +; GISEL-NEXT: v_writelane_b32 v40, s16, 18 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -912,10 +901,9 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; 
GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -934,9 +922,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v41, s16, 0 +; GCN-NEXT: v_writelane_b32 v40, s16, 20 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -1014,10 +1001,9 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v41, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 20 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -1031,9 +1017,8 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v41, s16, 0 +; GISEL-NEXT: v_writelane_b32 v40, s16, 20 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -1111,10 +1096,9 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v41, 0 +; GISEL-NEXT: 
v_readlane_b32 s4, v40, 20 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -1327,7 +1311,7 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1418,14 +1402,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s12 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1516,7 +1500,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s12 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1531,7 +1515,7 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 
s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1620,14 +1604,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s12 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1716,7 +1700,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s12 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1728,7 +1712,7 @@ ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1814,14 +1798,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s12 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s12, s33 +; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1907,7 +1891,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s12 +; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -7,7 +7,7 @@ ; GFX11-LABEL: f0: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, s33 +; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill @@ -28,7 +28,7 @@ ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s3 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -53,20 +53,20 @@ ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] -; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s18, s14 ; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s19, exec_lo ; GFX11-NEXT: s_mov_b32 
s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -74,7 +74,7 @@ ; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s24, -1, 0 +; GFX11-NEXT: s_cselect_b32 s25, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_mov_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 @@ -90,40 +90,41 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s1, -1 -; GFX11-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_cbranch_vccz .LBB2_4 ; GFX11-NEXT: s_branch .LBB2_12 ; GFX11-NEXT: .LBB2_3: ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s3, s[16:17], 0x54 +; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54 ; GFX11-NEXT: s_bitcmp1_b32 s23, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_and_b32 s9, s23, 1 +; GFX11-NEXT: s_and_b32 s3, s23, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s3, 0 -; GFX11-NEXT: s_mov_b32 s3, -1 +; GFX11-NEXT: s_bitcmp1_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cmp_eq_u32 s3, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s3, s29, s28 -; GFX11-NEXT: s_mul_i32 s9, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s3, s9, 1 -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s3, s3, 1 -; GFX11-NEXT: s_lshr_b32 s3, s3, s30 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s9, s3, s22 +; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28 +; GFX11-NEXT: s_mul_i32 s3, s29, s28 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1 ; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_mul_i32 s9, s9, s20 -; GFX11-NEXT: s_or_b32 s2, s2, s9 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s2, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, s30 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s2, s2, s22 +; GFX11-NEXT: s_mul_i32 s2, s2, s20 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s24, s2 ; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], 1 ; GFX11-NEXT: global_load_u16 v2, v1, s[22:23] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -151,10 +152,10 @@ ; GFX11-NEXT: s_or_b32 s3, s2, s3 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow -; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: .LBB2_8: ; %Flow12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s3 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: ; GFX11-NEXT: s_xor_b32 s0, s8, -1 @@ -166,11 +167,11 @@ ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s21, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s20, s1, exec_lo +; GFX11-NEXT: s_and_b32 s3, s1, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo ; GFX11-NEXT: .LBB2_13: ; 
%Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19 -; GFX11-NEXT: s_and_saveexec_b32 s2, s0 +; GFX11-NEXT: s_and_saveexec_b32 s19, s0 ; GFX11-NEXT: s_cbranch_execz .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 ; GFX11-NEXT: s_add_u32 s8, s16, 0x58 @@ -183,10 +184,10 @@ ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s20, s20, exec_lo +; GFX11-NEXT: s_or_b32 s3, s3, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_and_saveexec_b32 s0, s20 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19 +; GFX11-NEXT: s_and_saveexec_b32 s0, s3 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -13,17 +13,30 @@ ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_writelane_b32 v40, s16, 0 +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s12, s33, 0x100200 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v40, 0 +; CHECK-NEXT: v_readlane_b32 s14, v3, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 
4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v40, s8, 1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v0, s8, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND @@ -56,9 +69,14 @@ ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v40, 1 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_readlane_b32 s4, v0, 1 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_cmp_eq_u32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -66,14 +84,24 @@ ; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %store +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100000 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: 
ds_write_b32 v1, v2 +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm %arr = alloca < 1339 x i32>, align 8192, addrspace(5) %cmp = icmp ne i32 %val, 0 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -141,82 +141,108 @@ ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 +; W64-O0-NEXT: ; implicit-def: $vgpr5 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v3 ; W64-O0-NEXT: v_mov_b32_e32 v6, v2 ; W64-O0-NEXT: v_mov_b32_e32 v7, v1 +; W64-O0-NEXT: v_mov_b32_e32 v1, v0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v7 -; W64-O0-NEXT: v_mov_b32_e32 v2, v6 -; W64-O0-NEXT: v_mov_b32_e32 v3, v4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v7 +; W64-O0-NEXT: v_mov_b32_e32 v3, v6 +; W64-O0-NEXT: v_mov_b32_e32 v4, v5 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v5, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v5, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v5, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; 
W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v5, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v5, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v5, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v5, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v5, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v5, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v5, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v5, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v5, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v5, 4 -; W64-O0-NEXT: v_readlane_b32 
s10, v5, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v5, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v5, 0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v5, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v5, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(1) +; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; 
W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -468,185 +494,221 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: ; implicit-def: $vgpr13 +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v14, v4 +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v13, v4 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v3 -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v8, v1 +; W64-O0-NEXT: v_mov_b32_e32 v7, v3 +; W64-O0-NEXT: v_mov_b32_e32 v8, v2 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v9, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v2, v0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v3, v0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v5 +; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v14, v5 +; W64-O0-NEXT: v_mov_b32_e32 v15, v6 ; W64-O0-NEXT: s_waitcnt vmcnt(3) ; W64-O0-NEXT: v_mov_b32_e32 v16, v4 -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_mov_b32_e32 v17, v3 -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, 
v8 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 -; W64-O0-NEXT: v_mov_b32_e32 v5, v6 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v4, v9 +; W64-O0-NEXT: v_mov_b32_e32 v5, v8 +; W64-O0-NEXT: v_mov_b32_e32 v6, v7 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v12 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v12 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v10 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v10 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v13, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v13, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: 
s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v13, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v13, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v13, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v13, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v13, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v13, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v13, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v13, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v13, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v13, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 +; 
W64-O0-NEXT: v_readlane_b32 s11, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v13, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 10 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, 
v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v13, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v13, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v13, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v13, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v13, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v13, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v13, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v13, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v13, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v13, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 +; 
W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v13, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 10 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(6) +; W64-O0-NEXT: 
v_readlane_b32 s4, v0, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[3:4], v5, off +; W64-O0-NEXT: global_store_dword v[4:5], v6, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[0:1], v2, off +; W64-O0-NEXT: global_store_dword v[1:2], v3, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -954,204 +1016,250 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v9, v7 -; W64-O0-NEXT: v_mov_b32_e32 v10, v6 -; W64-O0-NEXT: v_mov_b32_e32 v11, v5 -; W64-O0-NEXT: v_mov_b32_e32 v5, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: ; implicit-def: $vgpr8 +; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v8, v6 +; 
W64-O0-NEXT: v_mov_b32_e32 v9, v5 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: v_mov_b32_e32 v7, v1 -; W64-O0-NEXT: v_mov_b32_e32 v13, v0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v10, v3 +; W64-O0-NEXT: v_mov_b32_e32 v11, v2 +; W64-O0-NEXT: v_mov_b32_e32 v13, v1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v6, v0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v11 -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: v_mov_b32_e32 v3, v9 +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v9 +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v7 -; W64-O0-NEXT: v_mov_b32_e32 v15, v6 -; W64-O0-NEXT: v_mov_b32_e32 v16, v5 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: 
def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v7, v13 +; W64-O0-NEXT: v_mov_b32_e32 v8, v11 +; W64-O0-NEXT: v_mov_b32_e32 v9, v10 +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v5, v12 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v6, v12 ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: 
$sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 0 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v0, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 +; W64-O0-NEXT: v_mov_b32_e32 v1, s4 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v8, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; 
W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v8, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v8, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v8, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v8, s11, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v8, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v8, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v8, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v8, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v8, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v8, 1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; 
W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s6, v8, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v8, 3 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v8, 1 +; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte 
Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v8, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 -; W64-O0-NEXT: v_readlane_b32 s4, v8, 0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v0, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 +; W64-O0-NEXT: v_mov_b32_e32 v1, s4 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v8, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v8, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v8, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v8, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v8, s11, 18 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v8, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 20 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] 
; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v8, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v8, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v8, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v8, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v8, 12 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 14 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 +; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 11 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(3) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[0:1], v2, off +; W64-O0-NEXT: global_store_dword v[1:2], v3, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ 
b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -140,94 +140,123 @@ ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: ; implicit-def: $vgpr5 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: v_mov_b32_e32 v2, v1 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v3, v1 +; W64-O0-NEXT: v_mov_b32_e32 v1, v0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v7, v3 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: s_waitcnt vmcnt(1) +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v5, v7 ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v2 -; W64-O0-NEXT: v_mov_b32_e32 v7, v1 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v3 +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v7 -; W64-O0-NEXT: v_mov_b32_e32 v2, v6 -; W64-O0-NEXT: v_mov_b32_e32 v3, v4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v7 +; W64-O0-NEXT: v_mov_b32_e32 v3, v6 +; W64-O0-NEXT: v_mov_b32_e32 v4, v5 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v5, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v5, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v5, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: 
s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v5, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v5, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v5, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v5, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v5, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v5, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v5, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v5, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v5, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v5, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v5, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v5, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v5, 0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v5, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v5, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(1) +; W64-O0-NEXT: v_readlane_b32 s4, 
v1, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -479,86 +508,89 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: ; implicit-def: $vgpr13 +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v15, v6 -; W64-O0-NEXT: v_mov_b32_e32 v8, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v14, v4 -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v4, v1 +; W64-O0-NEXT: v_mov_b32_e32 v14, v6 +; W64-O0-NEXT: v_mov_b32_e32 v9, v5 +; W64-O0-NEXT: 
v_mov_b32_e32 v13, v4 +; W64-O0-NEXT: v_mov_b32_e32 v4, v3 +; W64-O0-NEXT: v_mov_b32_e32 v8, v2 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v5, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v2, v0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v3, v0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_mov_b32_e32 v16, v5 -; W64-O0-NEXT: v_mov_b32_e32 v5, v16 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v15, v7 ; W64-O0-NEXT: v_mov_b32_e32 v6, v15 +; W64-O0-NEXT: v_mov_b32_e32 v7, v14 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v8 -; W64-O0-NEXT: v_mov_b32_e32 v8, v15 -; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v14, v9 +; W64-O0-NEXT: v_mov_b32_e32 v9, v14 +; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v8 +; W64-O0-NEXT: ; kill: def $vgpr13 killed 
$vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v14, v9 +; W64-O0-NEXT: v_mov_b32_e32 v15, v7 ; W64-O0-NEXT: v_mov_b32_e32 v16, v6 -; W64-O0-NEXT: v_mov_b32_e32 v17, v5 -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v8, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v8 -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v9, v4 +; W64-O0-NEXT: v_mov_b32_e32 v7, v9 +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v4 -; W64-O0-NEXT: v_mov_b32_e32 v8, v3 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v4, v5 +; W64-O0-NEXT: v_mov_b32_e32 v9, v4 +; W64-O0-NEXT: ; kill: def $vgpr3 
killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v8 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 -; W64-O0-NEXT: v_mov_b32_e32 v5, v6 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v4, v9 +; W64-O0-NEXT: v_mov_b32_e32 v5, v8 +; W64-O0-NEXT: v_mov_b32_e32 v6, v7 ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v12 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v12 ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v10 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v10 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -566,122 +598,156 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v13, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v13, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; 
W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v13, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v13, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v13, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v13, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v13, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v13, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v13, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v13, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v13, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v13, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 2 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v13, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 10 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:40 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v13, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v13, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v13, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v13, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v13, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v13, s5, 16 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 ; W64-O0-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v13, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v13, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v13, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v13, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v13, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v13, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v13, 10 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(6) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[3:4], v5, off +; W64-O0-NEXT: global_store_dword v[4:5], v6, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[0:1], v2, off +; W64-O0-NEXT: global_store_dword v[1:2], v3, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -989,239 +1055,288 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v5, v4 -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: ; implicit-def: $vgpr8 +; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v6, v5 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v10, v2 +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v4, v3 +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v13, v2 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v1 +; W64-O0-NEXT: v_mov_b32_e32 v10, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v9, v0 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v8, v0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v11, v3 -; W64-O0-NEXT: v_mov_b32_e32 v3, v11 -; W64-O0-NEXT: v_mov_b32_e32 v5, v10 +; W64-O0-NEXT: ; kill: def 
$vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v14, v4 +; W64-O0-NEXT: v_mov_b32_e32 v4, v14 +; W64-O0-NEXT: v_mov_b32_e32 v6, v13 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v10, v6 -; W64-O0-NEXT: v_mov_b32_e32 v6, v10 +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v9, v10 ; W64-O0-NEXT: v_mov_b32_e32 v13, v9 +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v6 -; W64-O0-NEXT: v_mov_b32_e32 v15, v5 -; W64-O0-NEXT: v_mov_b32_e32 v16, v3 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v9, v13 +; W64-O0-NEXT: v_mov_b32_e32 v10, v6 +; W64-O0-NEXT: v_mov_b32_e32 v11, v4 +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; 
W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v5, v7 +; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v6, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v1 +; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v4, v2 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v12 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; 
W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 0 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v0, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 +; W64-O0-NEXT: v_mov_b32_e32 v1, s4 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v8, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 3 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] 
+; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v8, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v8, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v8, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v8, s11, 7 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v8, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 9 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v8, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v8, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v8, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v8, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v8, 1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 
+; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s6, v8, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v8, 3 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v8, 1 +; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v8, s4, 10 -; 
W64-O0-NEXT: v_writelane_b32 v8, s5, 11 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 0 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(4) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_mov_b32_e32 v6, v4 -; W64-O0-NEXT: v_mov_b32_e32 v0, v3 -; W64-O0-NEXT: v_mov_b32_e32 v4, v2 -; W64-O0-NEXT: v_mov_b32_e32 v5, v1 +; W64-O0-NEXT: v_mov_b32_e32 v7, v5 +; W64-O0-NEXT: v_mov_b32_e32 v1, v4 +; W64-O0-NEXT: v_mov_b32_e32 v5, v3 +; W64-O0-NEXT: v_mov_b32_e32 v6, v2 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v1, v6 -; W64-O0-NEXT: v_mov_b32_e32 v2, v5 -; W64-O0-NEXT: v_mov_b32_e32 v3, v4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v7 +; W64-O0-NEXT: v_mov_b32_e32 v3, v6 +; W64-O0-NEXT: v_mov_b32_e32 v4, v5 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v0, s4 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 +; W64-O0-NEXT: v_mov_b32_e32 v1, s4 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v8, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 14 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; W64-O0-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v8, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v8, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v8, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v8, s11, 18 +; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 +; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 +; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 +; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v8, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v8, s5, 20 +; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 +; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 19 -; W64-O0-NEXT: v_readlane_b32 s5, 
v8, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v8, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v8, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v8, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v8, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v8, 12 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: s_nop 3 +; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 +; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 +; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 +; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 +; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 +; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 +; W64-O0-NEXT: s_nop 4 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 14 -; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 +; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 +; W64-O0-NEXT: s_mov_b64 exec, s[4:5] +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_nop 0 +; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: v_readlane_b32 s4, v8, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v8, 11 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: s_waitcnt vmcnt(3) +; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[0:1], v2, off +; W64-O0-NEXT: global_store_dword v[1:2], v3, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -192,13 +192,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: v_writelane_b32 v40, s4, 5 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v44, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 @@ -230,10 +229,9 @@ ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v44, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -54,7 +54,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -66,12 +66,16 @@ define amdgpu_kernel void @kernel_call() { ; 
CHECK-LABEL: kernel_call: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b32 s32, 0x400 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[24:25] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v3, 0 @@ -89,6 +93,10 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[24:25] +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_callee() @@ -113,9 +121,9 @@ ; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_xor_saveexec_b64 s[20:21], -1 +; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[20:21] +; CHECK-NEXT: s_mov_b64 exec, s[18:19] ; CHECK-NEXT: s_setpc_b64 s[16:17] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -126,12 +134,16 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-LABEL: kernel_tailcall: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b32 s32, 0x400 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; 
CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[24:25] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v3, 0 @@ -149,6 +161,10 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[24:25] +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee() @@ -172,7 +188,7 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -194,7 +210,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -206,7 +222,7 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s25, s33 +; CHECK-NEXT: s_mov_b32 s19, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -228,7 +244,7 @@ ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; 
CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s25 +; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -239,12 +255,16 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b32 s32, 0x400 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[24:25] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v3, 0 @@ -262,6 +282,11 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr0 killed $exec +; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[24:25] +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm entry: %call = call i32 @caller_save_vgpr_spill_fp() diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -15,9 +15,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 +; GCN-DAG: v_writelane_b32 v40, 
[[FP_SCRATCH_COPY]], 2 ; GCN-DAG: v_writelane_b32 v40, s30, 0 ; GCN-DAG: v_writelane_b32 v40, s31, 1 @@ -26,10 +25,9 @@ ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -17,9 +17,8 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v41, s16, 0 +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 @@ -39,12 +38,11 @@ ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v41, 0 +; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 +; 
CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,95 +11,107 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s94, -1 +; GCN-NEXT: s_mov_b32 s95, 0xe8f000 +; GCN-NEXT: s_add_u32 s92, s92, s11 +; GCN-NEXT: s_addc_u32 s93, s93, 0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 8 -; GCN-NEXT: v_writelane_b32 v0, s5, 9 -; GCN-NEXT: v_writelane_b32 v0, s6, 10 -; GCN-NEXT: v_writelane_b32 v0, s7, 11 -; GCN-NEXT: v_writelane_b32 v0, s8, 12 -; GCN-NEXT: v_writelane_b32 v0, s9, 
13 -; GCN-NEXT: v_writelane_b32 v0, s10, 14 -; GCN-NEXT: v_writelane_b32 v0, s11, 15 +; GCN-NEXT: v_writelane_b32 v2, s4, 8 +; GCN-NEXT: v_writelane_b32 v2, s5, 9 +; GCN-NEXT: v_writelane_b32 v2, s6, 10 +; GCN-NEXT: v_writelane_b32 v2, s7, 11 +; GCN-NEXT: v_writelane_b32 v2, s8, 12 +; GCN-NEXT: v_writelane_b32 v2, s9, 13 +; GCN-NEXT: v_writelane_b32 v2, s10, 14 +; GCN-NEXT: v_writelane_b32 v2, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-NEXT: v_writelane_b32 v0, s6, 18 -; GCN-NEXT: v_writelane_b32 v0, s7, 19 -; GCN-NEXT: v_writelane_b32 v0, s8, 20 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 -; GCN-NEXT: v_writelane_b32 v0, s10, 22 -; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v2, s4, 16 +; GCN-NEXT: v_writelane_b32 v2, s5, 17 +; GCN-NEXT: v_writelane_b32 v2, s6, 18 +; GCN-NEXT: v_writelane_b32 v2, s7, 19 +; GCN-NEXT: v_writelane_b32 v2, s8, 20 +; GCN-NEXT: v_writelane_b32 v2, s9, 21 +; GCN-NEXT: v_writelane_b32 v2, s10, 22 +; GCN-NEXT: v_writelane_b32 v2, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 24 -; GCN-NEXT: v_writelane_b32 v0, s5, 25 -; GCN-NEXT: v_writelane_b32 v0, s6, 26 -; GCN-NEXT: v_writelane_b32 v0, s7, 27 -; GCN-NEXT: v_writelane_b32 v0, s8, 28 -; GCN-NEXT: v_writelane_b32 v0, s9, 29 -; GCN-NEXT: v_writelane_b32 v0, s10, 30 -; GCN-NEXT: v_writelane_b32 v0, s11, 31 +; GCN-NEXT: v_writelane_b32 v2, s4, 24 +; GCN-NEXT: v_writelane_b32 v2, s5, 25 +; GCN-NEXT: v_writelane_b32 v2, s6, 26 +; GCN-NEXT: v_writelane_b32 v2, s7, 27 +; GCN-NEXT: v_writelane_b32 v2, s8, 28 +; GCN-NEXT: v_writelane_b32 v2, s9, 29 +; GCN-NEXT: v_writelane_b32 v2, s10, 30 +; GCN-NEXT: v_writelane_b32 v2, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 32 -; GCN-NEXT: v_writelane_b32 v0, s5, 33 -; 
GCN-NEXT: v_writelane_b32 v0, s6, 34 -; GCN-NEXT: v_writelane_b32 v0, s7, 35 -; GCN-NEXT: v_writelane_b32 v0, s8, 36 -; GCN-NEXT: v_writelane_b32 v0, s9, 37 -; GCN-NEXT: v_writelane_b32 v0, s10, 38 -; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v2, s4, 32 +; GCN-NEXT: v_writelane_b32 v2, s5, 33 +; GCN-NEXT: v_writelane_b32 v2, s6, 34 +; GCN-NEXT: v_writelane_b32 v2, s7, 35 +; GCN-NEXT: v_writelane_b32 v2, s8, 36 +; GCN-NEXT: v_writelane_b32 v2, s9, 37 +; GCN-NEXT: v_writelane_b32 v2, s10, 38 +; GCN-NEXT: v_writelane_b32 v2, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 40 -; GCN-NEXT: v_writelane_b32 v0, s5, 41 -; GCN-NEXT: v_writelane_b32 v0, s6, 42 -; GCN-NEXT: v_writelane_b32 v0, s7, 43 -; GCN-NEXT: v_writelane_b32 v0, s8, 44 -; GCN-NEXT: v_writelane_b32 v0, s9, 45 -; GCN-NEXT: v_writelane_b32 v0, s10, 46 -; GCN-NEXT: v_writelane_b32 v0, s11, 47 +; GCN-NEXT: v_writelane_b32 v2, s4, 40 +; GCN-NEXT: v_writelane_b32 v2, s5, 41 +; GCN-NEXT: v_writelane_b32 v2, s6, 42 +; GCN-NEXT: v_writelane_b32 v2, s7, 43 +; GCN-NEXT: v_writelane_b32 v2, s8, 44 +; GCN-NEXT: v_writelane_b32 v2, s9, 45 +; GCN-NEXT: v_writelane_b32 v2, s10, 46 +; GCN-NEXT: v_writelane_b32 v2, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 48 -; GCN-NEXT: v_writelane_b32 v0, s5, 49 -; GCN-NEXT: v_writelane_b32 v0, s6, 50 -; GCN-NEXT: v_writelane_b32 v0, s7, 51 -; GCN-NEXT: v_writelane_b32 v0, s8, 52 -; GCN-NEXT: v_writelane_b32 v0, s9, 53 -; GCN-NEXT: v_writelane_b32 v0, s10, 54 -; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v2, s4, 48 +; GCN-NEXT: v_writelane_b32 v2, s5, 49 +; GCN-NEXT: v_writelane_b32 v2, s6, 50 +; GCN-NEXT: v_writelane_b32 v2, s7, 51 +; GCN-NEXT: v_writelane_b32 v2, s8, 52 +; GCN-NEXT: v_writelane_b32 v2, s9, 53 +; GCN-NEXT: v_writelane_b32 v2, s10, 54 +; GCN-NEXT: v_writelane_b32 v2, s11, 55 ; 
GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 56 -; GCN-NEXT: v_writelane_b32 v0, s5, 57 -; GCN-NEXT: v_writelane_b32 v0, s6, 58 -; GCN-NEXT: v_writelane_b32 v0, s7, 59 -; GCN-NEXT: v_writelane_b32 v0, s8, 60 -; GCN-NEXT: v_writelane_b32 v0, s9, 61 -; GCN-NEXT: v_writelane_b32 v0, s10, 62 -; GCN-NEXT: v_writelane_b32 v0, s11, 63 +; GCN-NEXT: v_writelane_b32 v2, s4, 56 +; GCN-NEXT: v_writelane_b32 v2, s5, 57 +; GCN-NEXT: v_writelane_b32 v2, s6, 58 +; GCN-NEXT: v_writelane_b32 v2, s7, 59 +; GCN-NEXT: v_writelane_b32 v2, s8, 60 +; GCN-NEXT: v_writelane_b32 v2, s9, 61 +; GCN-NEXT: v_writelane_b32 v2, s10, 62 +; GCN-NEXT: v_writelane_b32 v2, s11, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND @@ -188,176 +200,192 @@ ; GCN-NEXT: v_writelane_b32 v1, s9, 61 ; GCN-NEXT: v_writelane_b32 v1, s10, 62 ; GCN-NEXT: v_writelane_b32 v1, s11, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 0 -; GCN-NEXT: v_writelane_b32 v2, s5, 1 -; GCN-NEXT: v_writelane_b32 v2, s6, 2 -; GCN-NEXT: v_writelane_b32 v2, s7, 3 -; GCN-NEXT: v_writelane_b32 v2, s8, 4 -; GCN-NEXT: v_writelane_b32 v2, s9, 5 -; GCN-NEXT: v_writelane_b32 v2, s10, 6 -; GCN-NEXT: v_writelane_b32 v2, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: s_or_saveexec_b64 
s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s8, v1, 56 -; GCN-NEXT: v_readlane_b32 s9, v1, 57 -; GCN-NEXT: v_readlane_b32 s10, v1, 58 -; GCN-NEXT: v_readlane_b32 s11, v1, 59 -; GCN-NEXT: v_readlane_b32 s12, v1, 60 -; GCN-NEXT: v_readlane_b32 s13, v1, 61 -; GCN-NEXT: v_readlane_b32 s14, v1, 62 -; GCN-NEXT: v_readlane_b32 s15, v1, 63 -; GCN-NEXT: v_readlane_b32 s16, v1, 48 -; GCN-NEXT: v_readlane_b32 s17, v1, 49 -; GCN-NEXT: v_readlane_b32 s18, v1, 50 -; GCN-NEXT: v_readlane_b32 s19, v1, 51 -; GCN-NEXT: v_readlane_b32 s20, v1, 52 -; GCN-NEXT: v_readlane_b32 s21, v1, 53 -; GCN-NEXT: v_readlane_b32 s22, v1, 54 -; GCN-NEXT: v_readlane_b32 s23, v1, 55 -; GCN-NEXT: v_readlane_b32 s24, v1, 40 -; GCN-NEXT: v_readlane_b32 s25, v1, 41 -; GCN-NEXT: v_readlane_b32 s26, v1, 42 -; GCN-NEXT: v_readlane_b32 s27, v1, 43 -; GCN-NEXT: v_readlane_b32 s28, v1, 44 -; GCN-NEXT: v_readlane_b32 s29, v1, 45 -; GCN-NEXT: v_readlane_b32 s30, v1, 46 -; GCN-NEXT: v_readlane_b32 s31, v1, 47 -; GCN-NEXT: v_readlane_b32 s36, v1, 32 -; GCN-NEXT: v_readlane_b32 s37, v1, 33 -; GCN-NEXT: v_readlane_b32 s38, v1, 34 -; GCN-NEXT: v_readlane_b32 s39, v1, 35 -; GCN-NEXT: v_readlane_b32 s40, v1, 36 -; GCN-NEXT: v_readlane_b32 s41, v1, 37 -; GCN-NEXT: v_readlane_b32 s42, v1, 38 -; GCN-NEXT: v_readlane_b32 s43, v1, 39 -; GCN-NEXT: v_readlane_b32 s44, v1, 24 -; GCN-NEXT: v_readlane_b32 s45, v1, 25 -; GCN-NEXT: v_readlane_b32 s46, v1, 26 -; GCN-NEXT: v_readlane_b32 s47, v1, 27 -; GCN-NEXT: v_readlane_b32 s48, v1, 28 -; GCN-NEXT: v_readlane_b32 s49, v1, 29 -; GCN-NEXT: v_readlane_b32 s50, v1, 30 -; GCN-NEXT: v_readlane_b32 s51, v1, 31 -; GCN-NEXT: v_readlane_b32 s52, v1, 16 -; GCN-NEXT: v_readlane_b32 s53, v1, 17 -; GCN-NEXT: v_readlane_b32 
s54, v1, 18 -; GCN-NEXT: v_readlane_b32 s55, v1, 19 -; GCN-NEXT: v_readlane_b32 s56, v1, 20 -; GCN-NEXT: v_readlane_b32 s57, v1, 21 -; GCN-NEXT: v_readlane_b32 s58, v1, 22 -; GCN-NEXT: v_readlane_b32 s59, v1, 23 -; GCN-NEXT: v_readlane_b32 s60, v1, 8 -; GCN-NEXT: v_readlane_b32 s61, v1, 9 -; GCN-NEXT: v_readlane_b32 s62, v1, 10 -; GCN-NEXT: v_readlane_b32 s63, v1, 11 -; GCN-NEXT: v_readlane_b32 s64, v1, 12 -; GCN-NEXT: v_readlane_b32 s65, v1, 13 -; GCN-NEXT: v_readlane_b32 s66, v1, 14 -; GCN-NEXT: v_readlane_b32 s67, v1, 15 -; GCN-NEXT: v_readlane_b32 s68, v1, 0 -; GCN-NEXT: v_readlane_b32 s69, v1, 1 -; GCN-NEXT: v_readlane_b32 s70, v1, 2 -; GCN-NEXT: v_readlane_b32 s71, v1, 3 -; GCN-NEXT: v_readlane_b32 s72, v1, 4 -; GCN-NEXT: v_readlane_b32 s73, v1, 5 -; GCN-NEXT: v_readlane_b32 s74, v1, 6 -; GCN-NEXT: v_readlane_b32 s75, v1, 7 -; GCN-NEXT: v_readlane_b32 s76, v0, 56 -; GCN-NEXT: v_readlane_b32 s77, v0, 57 -; GCN-NEXT: v_readlane_b32 s78, v0, 58 -; GCN-NEXT: v_readlane_b32 s79, v0, 59 -; GCN-NEXT: v_readlane_b32 s80, v0, 60 -; GCN-NEXT: v_readlane_b32 s81, v0, 61 -; GCN-NEXT: v_readlane_b32 s82, v0, 62 -; GCN-NEXT: v_readlane_b32 s83, v0, 63 -; GCN-NEXT: v_readlane_b32 s84, v0, 48 -; GCN-NEXT: v_readlane_b32 s85, v0, 49 -; GCN-NEXT: v_readlane_b32 s86, v0, 50 -; GCN-NEXT: v_readlane_b32 s87, v0, 51 -; GCN-NEXT: v_readlane_b32 s88, v0, 52 -; GCN-NEXT: v_readlane_b32 s89, v0, 53 -; GCN-NEXT: v_readlane_b32 s90, v0, 54 -; GCN-NEXT: v_readlane_b32 s91, v0, 55 -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; 
GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s8, v2, 56 +; GCN-NEXT: v_readlane_b32 s9, v2, 57 +; GCN-NEXT: v_readlane_b32 s10, v2, 58 +; GCN-NEXT: v_readlane_b32 s11, v2, 59 +; GCN-NEXT: v_readlane_b32 s12, v2, 60 +; GCN-NEXT: v_readlane_b32 s13, v2, 61 +; GCN-NEXT: v_readlane_b32 s14, v2, 62 +; GCN-NEXT: v_readlane_b32 s15, v2, 63 +; GCN-NEXT: v_readlane_b32 s16, v2, 48 +; GCN-NEXT: v_readlane_b32 s17, v2, 49 +; GCN-NEXT: v_readlane_b32 s18, v2, 50 +; GCN-NEXT: v_readlane_b32 s19, v2, 51 +; GCN-NEXT: v_readlane_b32 s20, v2, 52 +; GCN-NEXT: v_readlane_b32 s21, v2, 53 +; GCN-NEXT: v_readlane_b32 s22, v2, 54 +; GCN-NEXT: v_readlane_b32 s23, v2, 55 +; GCN-NEXT: v_readlane_b32 s24, v2, 40 +; GCN-NEXT: v_readlane_b32 s25, v2, 41 +; GCN-NEXT: v_readlane_b32 s26, v2, 42 +; GCN-NEXT: v_readlane_b32 s27, v2, 43 +; GCN-NEXT: v_readlane_b32 s28, v2, 44 +; GCN-NEXT: v_readlane_b32 s29, v2, 45 +; GCN-NEXT: v_readlane_b32 s30, v2, 46 +; GCN-NEXT: v_readlane_b32 s31, v2, 47 +; GCN-NEXT: v_readlane_b32 s36, v2, 32 +; GCN-NEXT: v_readlane_b32 s37, v2, 33 +; GCN-NEXT: v_readlane_b32 s38, v2, 34 +; GCN-NEXT: v_readlane_b32 s39, v2, 35 +; GCN-NEXT: v_readlane_b32 s40, v2, 36 +; GCN-NEXT: v_readlane_b32 s41, v2, 37 +; GCN-NEXT: v_readlane_b32 s42, v2, 38 +; GCN-NEXT: v_readlane_b32 s43, v2, 39 +; GCN-NEXT: v_readlane_b32 s44, v2, 24 +; GCN-NEXT: v_readlane_b32 s45, v2, 25 +; GCN-NEXT: v_readlane_b32 s46, v2, 26 +; GCN-NEXT: v_readlane_b32 s47, v2, 27 +; GCN-NEXT: v_readlane_b32 s48, v2, 28 +; GCN-NEXT: v_readlane_b32 s49, v2, 29 +; GCN-NEXT: v_readlane_b32 s50, v2, 30 +; GCN-NEXT: v_readlane_b32 s51, v2, 31 +; GCN-NEXT: v_readlane_b32 s52, v2, 16 +; GCN-NEXT: v_readlane_b32 s53, v2, 17 +; 
GCN-NEXT: v_readlane_b32 s54, v2, 18 +; GCN-NEXT: v_readlane_b32 s55, v2, 19 +; GCN-NEXT: v_readlane_b32 s56, v2, 20 +; GCN-NEXT: v_readlane_b32 s57, v2, 21 +; GCN-NEXT: v_readlane_b32 s58, v2, 22 +; GCN-NEXT: v_readlane_b32 s59, v2, 23 +; GCN-NEXT: v_readlane_b32 s60, v2, 8 +; GCN-NEXT: v_readlane_b32 s61, v2, 9 +; GCN-NEXT: v_readlane_b32 s62, v2, 10 +; GCN-NEXT: v_readlane_b32 s63, v2, 11 +; GCN-NEXT: v_readlane_b32 s64, v2, 12 +; GCN-NEXT: v_readlane_b32 s65, v2, 13 +; GCN-NEXT: v_readlane_b32 s66, v2, 14 +; GCN-NEXT: v_readlane_b32 s67, v2, 15 +; GCN-NEXT: v_readlane_b32 s68, v2, 0 +; GCN-NEXT: v_readlane_b32 s69, v2, 1 +; GCN-NEXT: v_readlane_b32 s70, v2, 2 +; GCN-NEXT: v_readlane_b32 s71, v2, 3 +; GCN-NEXT: v_readlane_b32 s72, v2, 4 +; GCN-NEXT: v_readlane_b32 s73, v2, 5 +; GCN-NEXT: v_readlane_b32 s74, v2, 6 +; GCN-NEXT: v_readlane_b32 s75, v2, 7 +; GCN-NEXT: v_readlane_b32 s76, v1, 56 +; GCN-NEXT: v_readlane_b32 s77, v1, 57 +; GCN-NEXT: v_readlane_b32 s78, v1, 58 +; GCN-NEXT: v_readlane_b32 s79, v1, 59 +; GCN-NEXT: v_readlane_b32 s80, v1, 60 +; GCN-NEXT: v_readlane_b32 s81, v1, 61 +; GCN-NEXT: v_readlane_b32 s82, v1, 62 +; GCN-NEXT: v_readlane_b32 s83, v1, 63 +; GCN-NEXT: v_readlane_b32 s84, v1, 48 +; GCN-NEXT: v_readlane_b32 s85, v1, 49 +; GCN-NEXT: v_readlane_b32 s86, v1, 50 +; GCN-NEXT: v_readlane_b32 s87, v1, 51 +; GCN-NEXT: v_readlane_b32 s88, v1, 52 +; GCN-NEXT: v_readlane_b32 s89, v1, 53 +; GCN-NEXT: v_readlane_b32 s90, v1, 54 +; GCN-NEXT: v_readlane_b32 s91, v1, 55 +; GCN-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s2, v1, 2 +; GCN-NEXT: v_readlane_b32 s3, v1, 3 +; GCN-NEXT: v_readlane_b32 s4, v1, 4 +; GCN-NEXT: v_readlane_b32 s5, v1, 5 +; GCN-NEXT: v_readlane_b32 s6, v1, 6 +; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 8 -; GCN-NEXT: v_readlane_b32 s1, v0, 9 -; GCN-NEXT: v_readlane_b32 s2, v0, 10 -; 
GCN-NEXT: v_readlane_b32 s3, v0, 11 -; GCN-NEXT: v_readlane_b32 s4, v0, 12 -; GCN-NEXT: v_readlane_b32 s5, v0, 13 -; GCN-NEXT: v_readlane_b32 s6, v0, 14 -; GCN-NEXT: v_readlane_b32 s7, v0, 15 +; GCN-NEXT: v_readlane_b32 s0, v1, 8 +; GCN-NEXT: v_readlane_b32 s1, v1, 9 +; GCN-NEXT: v_readlane_b32 s2, v1, 10 +; GCN-NEXT: v_readlane_b32 s3, v1, 11 +; GCN-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-NEXT: v_readlane_b32 s6, v1, 14 +; GCN-NEXT: v_readlane_b32 s7, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 16 -; GCN-NEXT: v_readlane_b32 s1, v0, 17 -; GCN-NEXT: v_readlane_b32 s2, v0, 18 -; GCN-NEXT: v_readlane_b32 s3, v0, 19 -; GCN-NEXT: v_readlane_b32 s4, v0, 20 -; GCN-NEXT: v_readlane_b32 s5, v0, 21 -; GCN-NEXT: v_readlane_b32 s6, v0, 22 -; GCN-NEXT: v_readlane_b32 s7, v0, 23 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 24 -; GCN-NEXT: v_readlane_b32 s1, v0, 25 -; GCN-NEXT: v_readlane_b32 s2, v0, 26 -; GCN-NEXT: v_readlane_b32 s3, v0, 27 -; GCN-NEXT: v_readlane_b32 s4, v0, 28 -; GCN-NEXT: v_readlane_b32 s5, v0, 29 -; GCN-NEXT: v_readlane_b32 s6, v0, 30 -; GCN-NEXT: v_readlane_b32 s7, v0, 31 +; GCN-NEXT: v_readlane_b32 s0, v1, 24 +; GCN-NEXT: v_readlane_b32 s1, v1, 25 +; GCN-NEXT: v_readlane_b32 s2, v1, 26 +; GCN-NEXT: v_readlane_b32 s3, v1, 27 +; GCN-NEXT: v_readlane_b32 s4, v1, 28 +; GCN-NEXT: v_readlane_b32 s5, v1, 29 +; GCN-NEXT: v_readlane_b32 s6, v1, 30 +; GCN-NEXT: v_readlane_b32 s7, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 
32 -; GCN-NEXT: v_readlane_b32 s1, v0, 33 -; GCN-NEXT: v_readlane_b32 s2, v0, 34 -; GCN-NEXT: v_readlane_b32 s3, v0, 35 -; GCN-NEXT: v_readlane_b32 s4, v0, 36 -; GCN-NEXT: v_readlane_b32 s5, v0, 37 -; GCN-NEXT: v_readlane_b32 s6, v0, 38 -; GCN-NEXT: v_readlane_b32 s7, v0, 39 +; GCN-NEXT: v_readlane_b32 s0, v1, 32 +; GCN-NEXT: v_readlane_b32 s1, v1, 33 +; GCN-NEXT: v_readlane_b32 s2, v1, 34 +; GCN-NEXT: v_readlane_b32 s3, v1, 35 +; GCN-NEXT: v_readlane_b32 s4, v1, 36 +; GCN-NEXT: v_readlane_b32 s5, v1, 37 +; GCN-NEXT: v_readlane_b32 s6, v1, 38 +; GCN-NEXT: v_readlane_b32 s7, v1, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 40 -; GCN-NEXT: v_readlane_b32 s1, v0, 41 -; GCN-NEXT: v_readlane_b32 s2, v0, 42 -; GCN-NEXT: v_readlane_b32 s3, v0, 43 -; GCN-NEXT: v_readlane_b32 s4, v0, 44 -; GCN-NEXT: v_readlane_b32 s5, v0, 45 -; GCN-NEXT: v_readlane_b32 s6, v0, 46 -; GCN-NEXT: v_readlane_b32 s7, v0, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 40 +; GCN-NEXT: v_readlane_b32 s1, v1, 41 +; GCN-NEXT: v_readlane_b32 s2, v1, 42 +; GCN-NEXT: v_readlane_b32 s3, v1, 43 +; GCN-NEXT: v_readlane_b32 s4, v1, 44 +; GCN-NEXT: v_readlane_b32 s5, v1, 45 +; GCN-NEXT: v_readlane_b32 s6, v1, 46 +; GCN-NEXT: v_readlane_b32 s7, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v2, 0 -; GCN-NEXT: v_readlane_b32 s1, v2, 1 -; GCN-NEXT: v_readlane_b32 s2, v2, 2 -; GCN-NEXT: v_readlane_b32 s3, v2, 3 -; GCN-NEXT: v_readlane_b32 s4, v2, 4 -; GCN-NEXT: v_readlane_b32 s5, v2, 5 -; GCN-NEXT: v_readlane_b32 s6, v2, 6 -; GCN-NEXT: v_readlane_b32 s7, v2, 7 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 
use s[84:91] ; GCN-NEXT: ;;#ASMEND @@ -392,6 +420,18 @@ ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -442,104 +482,125 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s54, -1 +; GCN-NEXT: s_mov_b32 s55, 0xe8f000 +; GCN-NEXT: s_add_u32 s52, s52, s11 +; GCN-NEXT: s_addc_u32 s53, s53, 0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 -; GCN-NEXT: v_writelane_b32 v0, s12, 8 -; GCN-NEXT: v_writelane_b32 v0, s13, 9 -; GCN-NEXT: v_writelane_b32 v0, s14, 10 -; GCN-NEXT: v_writelane_b32 v0, s15, 11 -; GCN-NEXT: v_writelane_b32 v0, s16, 12 -; 
GCN-NEXT: v_writelane_b32 v0, s17, 13 -; GCN-NEXT: v_writelane_b32 v0, s18, 14 -; GCN-NEXT: v_writelane_b32 v0, s19, 15 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: v_writelane_b32 v1, s12, 8 +; GCN-NEXT: v_writelane_b32 v1, s13, 9 +; GCN-NEXT: v_writelane_b32 v1, s14, 10 +; GCN-NEXT: v_writelane_b32 v1, s15, 11 +; GCN-NEXT: v_writelane_b32 v1, s16, 12 +; GCN-NEXT: v_writelane_b32 v1, s17, 13 +; GCN-NEXT: v_writelane_b32 v1, s18, 14 +; GCN-NEXT: v_writelane_b32 v1, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-NEXT: v_writelane_b32 v0, s6, 18 -; GCN-NEXT: v_writelane_b32 v0, s7, 19 -; GCN-NEXT: v_writelane_b32 v0, s8, 20 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 -; GCN-NEXT: v_writelane_b32 v0, s10, 22 -; GCN-NEXT: v_writelane_b32 v0, s11, 23 -; GCN-NEXT: v_writelane_b32 v0, s12, 24 -; GCN-NEXT: v_writelane_b32 v0, s13, 25 -; GCN-NEXT: v_writelane_b32 v0, s14, 26 -; GCN-NEXT: v_writelane_b32 v0, s15, 27 -; GCN-NEXT: v_writelane_b32 v0, s16, 28 -; GCN-NEXT: v_writelane_b32 v0, s17, 29 -; GCN-NEXT: v_writelane_b32 v0, s18, 30 -; GCN-NEXT: v_writelane_b32 v0, s19, 31 +; GCN-NEXT: v_writelane_b32 v1, s4, 16 +; GCN-NEXT: v_writelane_b32 v1, s5, 17 +; GCN-NEXT: v_writelane_b32 v1, s6, 18 +; GCN-NEXT: v_writelane_b32 v1, s7, 19 +; GCN-NEXT: v_writelane_b32 v1, s8, 20 +; GCN-NEXT: v_writelane_b32 v1, s9, 21 +; GCN-NEXT: v_writelane_b32 v1, s10, 22 +; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v1, s12, 24 +; GCN-NEXT: v_writelane_b32 v1, s13, 25 +; GCN-NEXT: v_writelane_b32 v1, s14, 26 +; GCN-NEXT: v_writelane_b32 v1, s15, 27 +; GCN-NEXT: v_writelane_b32 
v1, s16, 28 +; GCN-NEXT: v_writelane_b32 v1, s17, 29 +; GCN-NEXT: v_writelane_b32 v1, s18, 30 +; GCN-NEXT: v_writelane_b32 v1, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 32 -; GCN-NEXT: v_writelane_b32 v0, s5, 33 -; GCN-NEXT: v_writelane_b32 v0, s6, 34 -; GCN-NEXT: v_writelane_b32 v0, s7, 35 -; GCN-NEXT: v_writelane_b32 v0, s8, 36 -; GCN-NEXT: v_writelane_b32 v0, s9, 37 -; GCN-NEXT: v_writelane_b32 v0, s10, 38 -; GCN-NEXT: v_writelane_b32 v0, s11, 39 -; GCN-NEXT: v_writelane_b32 v0, s12, 40 -; GCN-NEXT: v_writelane_b32 v0, s13, 41 -; GCN-NEXT: v_writelane_b32 v0, s14, 42 -; GCN-NEXT: v_writelane_b32 v0, s15, 43 -; GCN-NEXT: v_writelane_b32 v0, s16, 44 -; GCN-NEXT: v_writelane_b32 v0, s17, 45 -; GCN-NEXT: v_writelane_b32 v0, s18, 46 -; GCN-NEXT: v_writelane_b32 v0, s19, 47 +; GCN-NEXT: v_writelane_b32 v1, s4, 32 +; GCN-NEXT: v_writelane_b32 v1, s5, 33 +; GCN-NEXT: v_writelane_b32 v1, s6, 34 +; GCN-NEXT: v_writelane_b32 v1, s7, 35 +; GCN-NEXT: v_writelane_b32 v1, s8, 36 +; GCN-NEXT: v_writelane_b32 v1, s9, 37 +; GCN-NEXT: v_writelane_b32 v1, s10, 38 +; GCN-NEXT: v_writelane_b32 v1, s11, 39 +; GCN-NEXT: v_writelane_b32 v1, s12, 40 +; GCN-NEXT: v_writelane_b32 v1, s13, 41 +; GCN-NEXT: v_writelane_b32 v1, s14, 42 +; GCN-NEXT: v_writelane_b32 v1, s15, 43 +; GCN-NEXT: v_writelane_b32 v1, s16, 44 +; GCN-NEXT: v_writelane_b32 v1, s17, 45 +; GCN-NEXT: v_writelane_b32 v1, s18, 46 +; GCN-NEXT: v_writelane_b32 v1, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 48 -; GCN-NEXT: v_writelane_b32 v0, s5, 49 -; GCN-NEXT: v_writelane_b32 v0, s6, 50 -; GCN-NEXT: v_writelane_b32 v0, s7, 51 -; GCN-NEXT: v_writelane_b32 v0, s8, 52 -; GCN-NEXT: v_writelane_b32 v0, s9, 53 -; GCN-NEXT: v_writelane_b32 v0, s10, 54 -; GCN-NEXT: v_writelane_b32 v0, s11, 55 -; GCN-NEXT: v_writelane_b32 v0, s12, 56 -; GCN-NEXT: v_writelane_b32 v0, s13, 57 -; GCN-NEXT: 
v_writelane_b32 v0, s14, 58 -; GCN-NEXT: v_writelane_b32 v0, s15, 59 -; GCN-NEXT: v_writelane_b32 v0, s16, 60 -; GCN-NEXT: v_writelane_b32 v0, s17, 61 -; GCN-NEXT: v_writelane_b32 v0, s18, 62 -; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s4, 48 +; GCN-NEXT: v_writelane_b32 v1, s5, 49 +; GCN-NEXT: v_writelane_b32 v1, s6, 50 +; GCN-NEXT: v_writelane_b32 v1, s7, 51 +; GCN-NEXT: v_writelane_b32 v1, s8, 52 +; GCN-NEXT: v_writelane_b32 v1, s9, 53 +; GCN-NEXT: v_writelane_b32 v1, s10, 54 +; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v1, s12, 56 +; GCN-NEXT: v_writelane_b32 v1, s13, 57 +; GCN-NEXT: v_writelane_b32 v1, s14, 58 +; GCN-NEXT: v_writelane_b32 v1, s15, 59 +; GCN-NEXT: v_writelane_b32 v1, s16, 60 +; GCN-NEXT: v_writelane_b32 v1, s17, 61 +; GCN-NEXT: v_writelane_b32 v1, s18, 62 +; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s2, 8 -; GCN-NEXT: v_writelane_b32 v1, s3, 9 +; GCN-NEXT: v_writelane_b32 v0, s2, 8 +; GCN-NEXT: v_writelane_b32 v0, s3, 9 +; GCN-NEXT: 
s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 ; GCN-NEXT: v_readlane_b32 s17, v1, 9 ; GCN-NEXT: v_readlane_b32 s20, v1, 0 @@ -633,6 +694,14 @@ ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %ret +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -669,9 +738,17 @@ ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 
offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -685,176 +762,176 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 0 -; GCN-NEXT: v_writelane_b32 v31, s5, 1 -; GCN-NEXT: v_writelane_b32 v31, s6, 2 -; GCN-NEXT: v_writelane_b32 v31, s7, 3 -; GCN-NEXT: v_writelane_b32 v31, s8, 4 -; GCN-NEXT: v_writelane_b32 v31, s9, 5 -; GCN-NEXT: v_writelane_b32 v31, s10, 6 -; GCN-NEXT: v_writelane_b32 v31, s11, 7 -; GCN-NEXT: v_writelane_b32 v31, s12, 8 -; GCN-NEXT: v_writelane_b32 v31, s13, 9 -; GCN-NEXT: v_writelane_b32 v31, s14, 10 -; GCN-NEXT: v_writelane_b32 v31, s15, 11 -; GCN-NEXT: v_writelane_b32 v31, s16, 12 -; GCN-NEXT: v_writelane_b32 v31, s17, 13 -; GCN-NEXT: v_writelane_b32 v31, s18, 14 -; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: v_writelane_b32 v1, s12, 8 +; GCN-NEXT: v_writelane_b32 v1, s13, 9 +; GCN-NEXT: v_writelane_b32 v1, s14, 10 +; GCN-NEXT: v_writelane_b32 v1, s15, 11 +; GCN-NEXT: v_writelane_b32 v1, s16, 12 +; GCN-NEXT: v_writelane_b32 v1, s17, 13 +; GCN-NEXT: v_writelane_b32 v1, s18, 14 +; GCN-NEXT: v_writelane_b32 v1, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 16 -; GCN-NEXT: v_writelane_b32 v31, s5, 17 -; GCN-NEXT: v_writelane_b32 v31, s6, 18 -; GCN-NEXT: v_writelane_b32 v31, s7, 19 -; GCN-NEXT: v_writelane_b32 v31, s8, 20 -; GCN-NEXT: v_writelane_b32 v31, s9, 21 -; GCN-NEXT: v_writelane_b32 v31, s10, 22 -; GCN-NEXT: v_writelane_b32 v31, s11, 23 -; GCN-NEXT: v_writelane_b32 v31, 
s12, 24 -; GCN-NEXT: v_writelane_b32 v31, s13, 25 -; GCN-NEXT: v_writelane_b32 v31, s14, 26 -; GCN-NEXT: v_writelane_b32 v31, s15, 27 -; GCN-NEXT: v_writelane_b32 v31, s16, 28 -; GCN-NEXT: v_writelane_b32 v31, s17, 29 -; GCN-NEXT: v_writelane_b32 v31, s18, 30 -; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: v_writelane_b32 v1, s4, 16 +; GCN-NEXT: v_writelane_b32 v1, s5, 17 +; GCN-NEXT: v_writelane_b32 v1, s6, 18 +; GCN-NEXT: v_writelane_b32 v1, s7, 19 +; GCN-NEXT: v_writelane_b32 v1, s8, 20 +; GCN-NEXT: v_writelane_b32 v1, s9, 21 +; GCN-NEXT: v_writelane_b32 v1, s10, 22 +; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v1, s12, 24 +; GCN-NEXT: v_writelane_b32 v1, s13, 25 +; GCN-NEXT: v_writelane_b32 v1, s14, 26 +; GCN-NEXT: v_writelane_b32 v1, s15, 27 +; GCN-NEXT: v_writelane_b32 v1, s16, 28 +; GCN-NEXT: v_writelane_b32 v1, s17, 29 +; GCN-NEXT: v_writelane_b32 v1, s18, 30 +; GCN-NEXT: v_writelane_b32 v1, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 32 -; GCN-NEXT: v_writelane_b32 v31, s5, 33 -; GCN-NEXT: v_writelane_b32 v31, s6, 34 -; GCN-NEXT: v_writelane_b32 v31, s7, 35 -; GCN-NEXT: v_writelane_b32 v31, s8, 36 -; GCN-NEXT: v_writelane_b32 v31, s9, 37 -; GCN-NEXT: v_writelane_b32 v31, s10, 38 -; GCN-NEXT: v_writelane_b32 v31, s11, 39 -; GCN-NEXT: v_writelane_b32 v31, s12, 40 -; GCN-NEXT: v_writelane_b32 v31, s13, 41 -; GCN-NEXT: v_writelane_b32 v31, s14, 42 -; GCN-NEXT: v_writelane_b32 v31, s15, 43 -; GCN-NEXT: v_writelane_b32 v31, s16, 44 -; GCN-NEXT: v_writelane_b32 v31, s17, 45 -; GCN-NEXT: v_writelane_b32 v31, s18, 46 -; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: v_writelane_b32 v1, s4, 32 +; GCN-NEXT: v_writelane_b32 v1, s5, 33 +; GCN-NEXT: v_writelane_b32 v1, s6, 34 +; GCN-NEXT: v_writelane_b32 v1, s7, 35 +; GCN-NEXT: v_writelane_b32 v1, s8, 36 +; GCN-NEXT: v_writelane_b32 v1, s9, 37 +; GCN-NEXT: v_writelane_b32 v1, s10, 38 +; GCN-NEXT: v_writelane_b32 
v1, s11, 39 +; GCN-NEXT: v_writelane_b32 v1, s12, 40 +; GCN-NEXT: v_writelane_b32 v1, s13, 41 +; GCN-NEXT: v_writelane_b32 v1, s14, 42 +; GCN-NEXT: v_writelane_b32 v1, s15, 43 +; GCN-NEXT: v_writelane_b32 v1, s16, 44 +; GCN-NEXT: v_writelane_b32 v1, s17, 45 +; GCN-NEXT: v_writelane_b32 v1, s18, 46 +; GCN-NEXT: v_writelane_b32 v1, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 48 -; GCN-NEXT: v_writelane_b32 v31, s5, 49 -; GCN-NEXT: v_writelane_b32 v31, s6, 50 -; GCN-NEXT: v_writelane_b32 v31, s7, 51 -; GCN-NEXT: v_writelane_b32 v31, s8, 52 -; GCN-NEXT: v_writelane_b32 v31, s9, 53 -; GCN-NEXT: v_writelane_b32 v31, s10, 54 -; GCN-NEXT: v_writelane_b32 v31, s11, 55 -; GCN-NEXT: v_writelane_b32 v31, s12, 56 -; GCN-NEXT: v_writelane_b32 v31, s13, 57 -; GCN-NEXT: v_writelane_b32 v31, s14, 58 -; GCN-NEXT: v_writelane_b32 v31, s15, 59 -; GCN-NEXT: v_writelane_b32 v31, s16, 60 -; GCN-NEXT: v_writelane_b32 v31, s17, 61 -; GCN-NEXT: v_writelane_b32 v31, s18, 62 -; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s4, 48 +; GCN-NEXT: v_writelane_b32 v1, s5, 49 +; GCN-NEXT: v_writelane_b32 v1, s6, 50 +; GCN-NEXT: v_writelane_b32 v1, s7, 51 +; GCN-NEXT: v_writelane_b32 v1, s8, 52 +; GCN-NEXT: v_writelane_b32 v1, s9, 53 +; GCN-NEXT: v_writelane_b32 v1, s10, 54 +; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v1, s12, 56 +; GCN-NEXT: v_writelane_b32 v1, s13, 57 +; GCN-NEXT: v_writelane_b32 v1, s14, 58 +; GCN-NEXT: v_writelane_b32 v1, s15, 59 +; GCN-NEXT: v_writelane_b32 v1, s16, 60 +; GCN-NEXT: v_writelane_b32 v1, s17, 61 +; GCN-NEXT: v_writelane_b32 v1, s18, 62 +; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[4:5], 
exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s36, v31, 32 -; GCN-NEXT: v_readlane_b32 s37, v31, 33 -; GCN-NEXT: v_readlane_b32 s38, v31, 34 -; GCN-NEXT: v_readlane_b32 s39, v31, 35 -; GCN-NEXT: v_readlane_b32 s40, v31, 36 -; GCN-NEXT: v_readlane_b32 s41, v31, 37 -; GCN-NEXT: v_readlane_b32 s42, v31, 38 -; GCN-NEXT: v_readlane_b32 s43, v31, 39 -; GCN-NEXT: v_readlane_b32 s44, v31, 40 -; GCN-NEXT: v_readlane_b32 s45, v31, 41 -; GCN-NEXT: v_readlane_b32 s46, v31, 42 -; GCN-NEXT: v_readlane_b32 s47, v31, 43 -; GCN-NEXT: v_readlane_b32 s48, v31, 44 -; GCN-NEXT: v_readlane_b32 s49, v31, 45 -; GCN-NEXT: v_readlane_b32 s50, v31, 46 -; GCN-NEXT: v_readlane_b32 s51, v31, 47 -; GCN-NEXT: v_readlane_b32 s0, v31, 16 -; GCN-NEXT: v_readlane_b32 s1, v31, 17 -; GCN-NEXT: v_readlane_b32 s2, v31, 18 -; GCN-NEXT: v_readlane_b32 s3, v31, 19 -; GCN-NEXT: v_readlane_b32 s4, v31, 20 -; GCN-NEXT: v_readlane_b32 s5, v31, 21 -; GCN-NEXT: v_readlane_b32 s6, v31, 22 -; GCN-NEXT: v_readlane_b32 s7, v31, 23 -; GCN-NEXT: v_readlane_b32 s8, v31, 24 -; GCN-NEXT: v_readlane_b32 s9, v31, 25 -; GCN-NEXT: v_readlane_b32 s10, v31, 26 -; GCN-NEXT: v_readlane_b32 s11, v31, 27 -; GCN-NEXT: v_readlane_b32 s12, v31, 28 -; GCN-NEXT: v_readlane_b32 s13, v31, 29 -; GCN-NEXT: v_readlane_b32 s14, v31, 30 -; GCN-NEXT: v_readlane_b32 s15, v31, 31 -; GCN-NEXT: v_readlane_b32 s16, v31, 0 -; GCN-NEXT: 
v_readlane_b32 s17, v31, 1 -; GCN-NEXT: v_readlane_b32 s18, v31, 2 -; GCN-NEXT: v_readlane_b32 s19, v31, 3 -; GCN-NEXT: v_readlane_b32 s20, v31, 4 -; GCN-NEXT: v_readlane_b32 s21, v31, 5 -; GCN-NEXT: v_readlane_b32 s22, v31, 6 -; GCN-NEXT: v_readlane_b32 s23, v31, 7 -; GCN-NEXT: v_readlane_b32 s24, v31, 8 -; GCN-NEXT: v_readlane_b32 s25, v31, 9 -; GCN-NEXT: v_readlane_b32 s26, v31, 10 -; GCN-NEXT: v_readlane_b32 s27, v31, 11 -; GCN-NEXT: v_readlane_b32 s28, v31, 12 -; GCN-NEXT: v_readlane_b32 s29, v31, 13 -; GCN-NEXT: v_readlane_b32 s30, v31, 14 -; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s36, v1, 32 +; GCN-NEXT: v_readlane_b32 s37, v1, 33 +; GCN-NEXT: v_readlane_b32 s38, v1, 34 +; GCN-NEXT: v_readlane_b32 s39, v1, 35 +; GCN-NEXT: v_readlane_b32 s40, v1, 36 +; GCN-NEXT: v_readlane_b32 s41, v1, 37 +; GCN-NEXT: v_readlane_b32 s42, v1, 38 +; GCN-NEXT: v_readlane_b32 s43, v1, 39 +; GCN-NEXT: v_readlane_b32 s44, v1, 40 +; GCN-NEXT: v_readlane_b32 s45, v1, 41 +; GCN-NEXT: v_readlane_b32 s46, v1, 42 +; GCN-NEXT: v_readlane_b32 s47, v1, 43 +; GCN-NEXT: v_readlane_b32 s48, v1, 44 +; GCN-NEXT: v_readlane_b32 s49, v1, 45 +; GCN-NEXT: v_readlane_b32 s50, v1, 46 +; GCN-NEXT: v_readlane_b32 s51, v1, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 +; GCN-NEXT: v_readlane_b32 s8, v1, 24 +; GCN-NEXT: v_readlane_b32 s9, v1, 25 +; GCN-NEXT: 
v_readlane_b32 s10, v1, 26 +; GCN-NEXT: v_readlane_b32 s11, v1, 27 +; GCN-NEXT: v_readlane_b32 s12, v1, 28 +; GCN-NEXT: v_readlane_b32 s13, v1, 29 +; GCN-NEXT: v_readlane_b32 s14, v1, 30 +; GCN-NEXT: v_readlane_b32 s15, v1, 31 +; GCN-NEXT: v_readlane_b32 s16, v1, 0 +; GCN-NEXT: v_readlane_b32 s17, v1, 1 +; GCN-NEXT: v_readlane_b32 s18, v1, 2 +; GCN-NEXT: v_readlane_b32 s19, v1, 3 +; GCN-NEXT: v_readlane_b32 s20, v1, 4 +; GCN-NEXT: v_readlane_b32 s21, v1, 5 +; GCN-NEXT: v_readlane_b32 s22, v1, 6 +; GCN-NEXT: v_readlane_b32 s23, v1, 7 +; GCN-NEXT: v_readlane_b32 s24, v1, 8 +; GCN-NEXT: v_readlane_b32 s25, v1, 9 +; GCN-NEXT: v_readlane_b32 s26, v1, 10 +; GCN-NEXT: v_readlane_b32 s27, v1, 11 +; GCN-NEXT: v_readlane_b32 s28, v1, 12 +; GCN-NEXT: v_readlane_b32 s29, v1, 13 +; GCN-NEXT: v_readlane_b32 s30, v1, 14 +; GCN-NEXT: v_readlane_b32 s31, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v31, 48 -; GCN-NEXT: v_readlane_b32 s5, v31, 49 -; GCN-NEXT: v_readlane_b32 s6, v31, 50 -; GCN-NEXT: v_readlane_b32 s7, v31, 51 -; GCN-NEXT: v_readlane_b32 s8, v31, 52 -; GCN-NEXT: v_readlane_b32 s9, v31, 53 -; GCN-NEXT: v_readlane_b32 s10, v31, 54 -; GCN-NEXT: v_readlane_b32 s11, v31, 55 -; GCN-NEXT: v_readlane_b32 s12, v31, 56 -; GCN-NEXT: v_readlane_b32 s13, v31, 57 -; GCN-NEXT: v_readlane_b32 s14, v31, 58 -; GCN-NEXT: v_readlane_b32 s15, v31, 59 -; GCN-NEXT: v_readlane_b32 s16, v31, 60 -; GCN-NEXT: v_readlane_b32 s17, v31, 61 -; GCN-NEXT: v_readlane_b32 s18, v31, 62 -; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 48 +; GCN-NEXT: v_readlane_b32 s5, v1, 49 +; GCN-NEXT: v_readlane_b32 s6, 
v1, 50 +; GCN-NEXT: v_readlane_b32 s7, v1, 51 +; GCN-NEXT: v_readlane_b32 s8, v1, 52 +; GCN-NEXT: v_readlane_b32 s9, v1, 53 +; GCN-NEXT: v_readlane_b32 s10, v1, 54 +; GCN-NEXT: v_readlane_b32 s11, v1, 55 +; GCN-NEXT: v_readlane_b32 s12, v1, 56 +; GCN-NEXT: v_readlane_b32 s13, v1, 57 +; GCN-NEXT: v_readlane_b32 s14, v1, 58 +; GCN-NEXT: v_readlane_b32 s15, v1, 59 +; GCN-NEXT: v_readlane_b32 s16, v1, 60 +; GCN-NEXT: v_readlane_b32 s17, v1, 61 +; GCN-NEXT: v_readlane_b32 s18, v1, 62 +; GCN-NEXT: v_readlane_b32 s19, v1, 63 ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -865,6 +942,14 @@ ; GCN-NEXT: ; use s[0:1] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %ret +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 @@ -904,9 +989,17 @@ ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: 
s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -920,144 +1013,152 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 0 -; GCN-NEXT: v_writelane_b32 v31, s5, 1 -; GCN-NEXT: v_writelane_b32 v31, s6, 2 -; GCN-NEXT: v_writelane_b32 v31, s7, 3 -; GCN-NEXT: v_writelane_b32 v31, s8, 4 -; GCN-NEXT: v_writelane_b32 v31, s9, 5 -; GCN-NEXT: v_writelane_b32 v31, s10, 6 -; GCN-NEXT: v_writelane_b32 v31, s11, 7 -; GCN-NEXT: v_writelane_b32 v31, s12, 8 -; GCN-NEXT: v_writelane_b32 v31, s13, 9 -; GCN-NEXT: v_writelane_b32 v31, s14, 10 -; GCN-NEXT: v_writelane_b32 v31, s15, 11 -; GCN-NEXT: v_writelane_b32 v31, s16, 12 -; GCN-NEXT: v_writelane_b32 v31, s17, 13 -; GCN-NEXT: v_writelane_b32 v31, s18, 14 -; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: v_writelane_b32 v1, s12, 8 +; GCN-NEXT: v_writelane_b32 v1, s13, 9 +; GCN-NEXT: v_writelane_b32 v1, s14, 10 +; GCN-NEXT: v_writelane_b32 v1, s15, 11 +; GCN-NEXT: v_writelane_b32 v1, s16, 12 +; GCN-NEXT: v_writelane_b32 v1, s17, 13 +; GCN-NEXT: v_writelane_b32 v1, s18, 14 +; GCN-NEXT: v_writelane_b32 v1, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 16 -; GCN-NEXT: v_writelane_b32 v31, s5, 17 -; GCN-NEXT: v_writelane_b32 v31, s6, 18 -; GCN-NEXT: v_writelane_b32 v31, s7, 19 -; GCN-NEXT: v_writelane_b32 v31, s8, 20 -; GCN-NEXT: v_writelane_b32 v31, s9, 21 -; GCN-NEXT: v_writelane_b32 v31, s10, 22 -; GCN-NEXT: v_writelane_b32 v31, s11, 23 -; GCN-NEXT: v_writelane_b32 v31, s12, 24 -; GCN-NEXT: v_writelane_b32 v31, 
s13, 25 -; GCN-NEXT: v_writelane_b32 v31, s14, 26 -; GCN-NEXT: v_writelane_b32 v31, s15, 27 -; GCN-NEXT: v_writelane_b32 v31, s16, 28 -; GCN-NEXT: v_writelane_b32 v31, s17, 29 -; GCN-NEXT: v_writelane_b32 v31, s18, 30 -; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: v_writelane_b32 v1, s4, 16 +; GCN-NEXT: v_writelane_b32 v1, s5, 17 +; GCN-NEXT: v_writelane_b32 v1, s6, 18 +; GCN-NEXT: v_writelane_b32 v1, s7, 19 +; GCN-NEXT: v_writelane_b32 v1, s8, 20 +; GCN-NEXT: v_writelane_b32 v1, s9, 21 +; GCN-NEXT: v_writelane_b32 v1, s10, 22 +; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v1, s12, 24 +; GCN-NEXT: v_writelane_b32 v1, s13, 25 +; GCN-NEXT: v_writelane_b32 v1, s14, 26 +; GCN-NEXT: v_writelane_b32 v1, s15, 27 +; GCN-NEXT: v_writelane_b32 v1, s16, 28 +; GCN-NEXT: v_writelane_b32 v1, s17, 29 +; GCN-NEXT: v_writelane_b32 v1, s18, 30 +; GCN-NEXT: v_writelane_b32 v1, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 32 -; GCN-NEXT: v_writelane_b32 v31, s5, 33 -; GCN-NEXT: v_writelane_b32 v31, s6, 34 -; GCN-NEXT: v_writelane_b32 v31, s7, 35 -; GCN-NEXT: v_writelane_b32 v31, s8, 36 -; GCN-NEXT: v_writelane_b32 v31, s9, 37 -; GCN-NEXT: v_writelane_b32 v31, s10, 38 -; GCN-NEXT: v_writelane_b32 v31, s11, 39 -; GCN-NEXT: v_writelane_b32 v31, s12, 40 -; GCN-NEXT: v_writelane_b32 v31, s13, 41 -; GCN-NEXT: v_writelane_b32 v31, s14, 42 -; GCN-NEXT: v_writelane_b32 v31, s15, 43 -; GCN-NEXT: v_writelane_b32 v31, s16, 44 -; GCN-NEXT: v_writelane_b32 v31, s17, 45 -; GCN-NEXT: v_writelane_b32 v31, s18, 46 -; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: v_writelane_b32 v1, s4, 32 +; GCN-NEXT: v_writelane_b32 v1, s5, 33 +; GCN-NEXT: v_writelane_b32 v1, s6, 34 +; GCN-NEXT: v_writelane_b32 v1, s7, 35 +; GCN-NEXT: v_writelane_b32 v1, s8, 36 +; GCN-NEXT: v_writelane_b32 v1, s9, 37 +; GCN-NEXT: v_writelane_b32 v1, s10, 38 +; GCN-NEXT: v_writelane_b32 v1, s11, 39 +; GCN-NEXT: v_writelane_b32 
v1, s12, 40 +; GCN-NEXT: v_writelane_b32 v1, s13, 41 +; GCN-NEXT: v_writelane_b32 v1, s14, 42 +; GCN-NEXT: v_writelane_b32 v1, s15, 43 +; GCN-NEXT: v_writelane_b32 v1, s16, 44 +; GCN-NEXT: v_writelane_b32 v1, s17, 45 +; GCN-NEXT: v_writelane_b32 v1, s18, 46 +; GCN-NEXT: v_writelane_b32 v1, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 48 -; GCN-NEXT: v_writelane_b32 v31, s5, 49 -; GCN-NEXT: v_writelane_b32 v31, s6, 50 -; GCN-NEXT: v_writelane_b32 v31, s7, 51 -; GCN-NEXT: v_writelane_b32 v31, s8, 52 -; GCN-NEXT: v_writelane_b32 v31, s9, 53 -; GCN-NEXT: v_writelane_b32 v31, s10, 54 -; GCN-NEXT: v_writelane_b32 v31, s11, 55 -; GCN-NEXT: v_writelane_b32 v31, s12, 56 -; GCN-NEXT: v_writelane_b32 v31, s13, 57 -; GCN-NEXT: v_writelane_b32 v31, s14, 58 -; GCN-NEXT: v_writelane_b32 v31, s15, 59 -; GCN-NEXT: v_writelane_b32 v31, s16, 60 -; GCN-NEXT: v_writelane_b32 v31, s17, 61 -; GCN-NEXT: v_writelane_b32 v31, s18, 62 -; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: v_writelane_b32 v1, s4, 48 +; GCN-NEXT: v_writelane_b32 v1, s5, 49 +; GCN-NEXT: v_writelane_b32 v1, s6, 50 +; GCN-NEXT: v_writelane_b32 v1, s7, 51 +; GCN-NEXT: v_writelane_b32 v1, s8, 52 +; GCN-NEXT: v_writelane_b32 v1, s9, 53 +; GCN-NEXT: v_writelane_b32 v1, s10, 54 +; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v1, s12, 56 +; GCN-NEXT: v_writelane_b32 v1, s13, 57 +; GCN-NEXT: v_writelane_b32 v1, s14, 58 +; GCN-NEXT: v_writelane_b32 v1, s15, 59 +; GCN-NEXT: v_writelane_b32 v1, s16, 60 +; GCN-NEXT: v_writelane_b32 v1, s17, 61 +; GCN-NEXT: v_writelane_b32 v1, s18, 62 +; GCN-NEXT: v_writelane_b32 v1, s19, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; 
GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s36, v31, 32 -; GCN-NEXT: v_readlane_b32 s37, v31, 33 -; GCN-NEXT: v_readlane_b32 s38, v31, 34 -; GCN-NEXT: v_readlane_b32 s39, v31, 35 -; GCN-NEXT: v_readlane_b32 s40, v31, 36 -; GCN-NEXT: v_readlane_b32 s41, v31, 37 -; GCN-NEXT: v_readlane_b32 s42, v31, 38 -; GCN-NEXT: v_readlane_b32 s43, v31, 39 -; GCN-NEXT: v_readlane_b32 s44, v31, 40 -; GCN-NEXT: v_readlane_b32 s45, v31, 41 -; GCN-NEXT: v_readlane_b32 s46, v31, 42 -; GCN-NEXT: v_readlane_b32 s47, v31, 43 -; GCN-NEXT: v_readlane_b32 s48, v31, 44 -; GCN-NEXT: v_readlane_b32 s49, v31, 45 -; GCN-NEXT: v_readlane_b32 s50, v31, 46 -; GCN-NEXT: v_readlane_b32 s51, v31, 47 -; GCN-NEXT: v_readlane_b32 s0, v31, 16 -; GCN-NEXT: v_readlane_b32 s1, v31, 17 -; GCN-NEXT: v_readlane_b32 s2, v31, 18 -; GCN-NEXT: v_readlane_b32 s3, v31, 19 -; GCN-NEXT: v_readlane_b32 s4, v31, 20 -; GCN-NEXT: v_readlane_b32 s5, v31, 21 -; GCN-NEXT: v_readlane_b32 s6, v31, 22 -; GCN-NEXT: v_readlane_b32 s7, v31, 23 -; GCN-NEXT: v_readlane_b32 s8, v31, 24 -; GCN-NEXT: v_readlane_b32 s9, v31, 25 -; GCN-NEXT: v_readlane_b32 s10, v31, 26 -; GCN-NEXT: v_readlane_b32 s11, v31, 27 -; GCN-NEXT: v_readlane_b32 s12, v31, 28 -; GCN-NEXT: v_readlane_b32 s13, v31, 29 -; GCN-NEXT: v_readlane_b32 s14, v31, 30 -; GCN-NEXT: v_readlane_b32 s15, v31, 31 -; GCN-NEXT: v_readlane_b32 s16, v31, 0 -; GCN-NEXT: v_readlane_b32 s17, v31, 1 -; GCN-NEXT: 
v_readlane_b32 s18, v31, 2 -; GCN-NEXT: v_readlane_b32 s19, v31, 3 -; GCN-NEXT: v_readlane_b32 s20, v31, 4 -; GCN-NEXT: v_readlane_b32 s21, v31, 5 -; GCN-NEXT: v_readlane_b32 s22, v31, 6 -; GCN-NEXT: v_readlane_b32 s23, v31, 7 -; GCN-NEXT: v_readlane_b32 s24, v31, 8 -; GCN-NEXT: v_readlane_b32 s25, v31, 9 -; GCN-NEXT: v_readlane_b32 s26, v31, 10 -; GCN-NEXT: v_readlane_b32 s27, v31, 11 -; GCN-NEXT: v_readlane_b32 s28, v31, 12 -; GCN-NEXT: v_readlane_b32 s29, v31, 13 -; GCN-NEXT: v_readlane_b32 s30, v31, 14 -; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s36, v2, 32 +; GCN-NEXT: v_readlane_b32 s37, v2, 33 +; GCN-NEXT: v_readlane_b32 s38, v2, 34 +; GCN-NEXT: v_readlane_b32 s39, v2, 35 +; GCN-NEXT: v_readlane_b32 s40, v2, 36 +; GCN-NEXT: v_readlane_b32 s41, v2, 37 +; GCN-NEXT: v_readlane_b32 s42, v2, 38 +; GCN-NEXT: v_readlane_b32 s43, v2, 39 +; GCN-NEXT: v_readlane_b32 s44, v2, 40 +; GCN-NEXT: v_readlane_b32 s45, v2, 41 +; GCN-NEXT: v_readlane_b32 s46, v2, 42 +; GCN-NEXT: v_readlane_b32 s47, v2, 43 +; GCN-NEXT: v_readlane_b32 s48, v2, 44 +; GCN-NEXT: v_readlane_b32 s49, v2, 45 +; GCN-NEXT: v_readlane_b32 s50, v2, 46 +; GCN-NEXT: v_readlane_b32 s51, v2, 47 +; GCN-NEXT: v_readlane_b32 s0, v2, 16 +; GCN-NEXT: v_readlane_b32 s1, v2, 17 +; GCN-NEXT: v_readlane_b32 s2, v2, 18 +; GCN-NEXT: v_readlane_b32 s3, v2, 19 +; GCN-NEXT: v_readlane_b32 s4, v2, 20 +; GCN-NEXT: v_readlane_b32 s5, v2, 21 +; GCN-NEXT: v_readlane_b32 s6, v2, 22 +; GCN-NEXT: v_readlane_b32 s7, v2, 23 +; GCN-NEXT: v_readlane_b32 s8, v2, 24 +; GCN-NEXT: v_readlane_b32 s9, v2, 25 +; GCN-NEXT: v_readlane_b32 s10, v2, 26 +; GCN-NEXT: 
v_readlane_b32 s11, v2, 27 +; GCN-NEXT: v_readlane_b32 s12, v2, 28 +; GCN-NEXT: v_readlane_b32 s13, v2, 29 +; GCN-NEXT: v_readlane_b32 s14, v2, 30 +; GCN-NEXT: v_readlane_b32 s15, v2, 31 +; GCN-NEXT: v_readlane_b32 s16, v2, 0 +; GCN-NEXT: v_readlane_b32 s17, v2, 1 +; GCN-NEXT: v_readlane_b32 s18, v2, 2 +; GCN-NEXT: v_readlane_b32 s19, v2, 3 +; GCN-NEXT: v_readlane_b32 s20, v2, 4 +; GCN-NEXT: v_readlane_b32 s21, v2, 5 +; GCN-NEXT: v_readlane_b32 s22, v2, 6 +; GCN-NEXT: v_readlane_b32 s23, v2, 7 +; GCN-NEXT: v_readlane_b32 s24, v2, 8 +; GCN-NEXT: v_readlane_b32 s25, v2, 9 +; GCN-NEXT: v_readlane_b32 s26, v2, 10 +; GCN-NEXT: v_readlane_b32 s27, v2, 11 +; GCN-NEXT: v_readlane_b32 s28, v2, 12 +; GCN-NEXT: v_readlane_b32 s29, v2, 13 +; GCN-NEXT: v_readlane_b32 s30, v2, 14 +; GCN-NEXT: v_readlane_b32 s31, v2, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 ; GCN-NEXT: ;;#ASMEND @@ -1067,32 +1168,24 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v31, 48 -; GCN-NEXT: v_readlane_b32 s5, v31, 49 -; GCN-NEXT: v_readlane_b32 s6, v31, 50 -; GCN-NEXT: v_readlane_b32 s7, v31, 51 -; GCN-NEXT: v_readlane_b32 s8, v31, 52 -; GCN-NEXT: v_readlane_b32 s9, v31, 53 -; GCN-NEXT: v_readlane_b32 s10, v31, 54 -; GCN-NEXT: v_readlane_b32 s11, v31, 55 -; GCN-NEXT: v_readlane_b32 s12, v31, 56 -; GCN-NEXT: v_readlane_b32 s13, v31, 57 -; GCN-NEXT: v_readlane_b32 s14, v31, 58 -; GCN-NEXT: v_readlane_b32 s15, v31, 59 -; GCN-NEXT: v_readlane_b32 s16, v31, 60 -; GCN-NEXT: v_readlane_b32 s17, v31, 61 -; GCN-NEXT: v_readlane_b32 s18, v31, 62 -; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v2, 48 +; GCN-NEXT: v_readlane_b32 s5, v2, 49 +; GCN-NEXT: v_readlane_b32 s6, v2, 50 +; GCN-NEXT: 
v_readlane_b32 s7, v2, 51 +; GCN-NEXT: v_readlane_b32 s8, v2, 52 +; GCN-NEXT: v_readlane_b32 s9, v2, 53 +; GCN-NEXT: v_readlane_b32 s10, v2, 54 +; GCN-NEXT: v_readlane_b32 s11, v2, 55 +; GCN-NEXT: v_readlane_b32 s12, v2, 56 +; GCN-NEXT: v_readlane_b32 s13, v2, 57 +; GCN-NEXT: v_readlane_b32 s14, v2, 58 +; GCN-NEXT: v_readlane_b32 s15, v2, 59 +; GCN-NEXT: v_readlane_b32 s16, v2, 60 +; GCN-NEXT: v_readlane_b32 s17, v2, 61 +; GCN-NEXT: v_readlane_b32 s18, v2, 62 +; GCN-NEXT: v_readlane_b32 s19, v2, 63 ; GCN-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -1106,6 +1199,14 @@ ; GCN-NEXT: ; use v0 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -0,0 +1,816 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s + +; Due to high register pressure, regalloc would split the liverange 
of wwm VGPR register used for SGPR spills +; and introduce a copy. The copy should be of whole-wave with exec mask manipulation around it. +; FIXME: The destination register involved in the whole-wave copy should be considered for preserving all the lanes +; with a spill/restore at function prolog/epilog. The copy might otherwise clobber its inactive lanes unwantedly. +define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { +; GFX906-LABEL: preserve_wwm_copy_dstreg: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s16, s33 +; GFX906-NEXT: s_mov_b32 s33, s32 +; GFX906-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GFX906-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX906-NEXT: s_mov_b64 exec, -1 +; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; GFX906-NEXT: s_mov_b64 exec, s[18:19] +; GFX906-NEXT: s_mov_b32 s21, s15 +; GFX906-NEXT: ; implicit-def: $vgpr2 +; GFX906-NEXT: s_mov_b32 s22, s14 +; GFX906-NEXT: v_writelane_b32 v2, s21, 0 +; GFX906-NEXT: v_writelane_b32 v2, s22, 1 +; GFX906-NEXT: s_mov_b32 s23, s13 +; GFX906-NEXT: v_writelane_b32 v2, s23, 2 +; GFX906-NEXT: s_mov_b32 s24, s12 +; GFX906-NEXT: v_writelane_b32 v2, s24, 3 +; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11] +; GFX906-NEXT: v_writelane_b32 v2, s26, 4 +; GFX906-NEXT: v_writelane_b32 v2, s27, 5 +; GFX906-NEXT: v_writelane_b32 v2, s8, 6 +; GFX906-NEXT: v_writelane_b32 v2, s9, 7 +; GFX906-NEXT: v_writelane_b32 v2, s6, 8 +; GFX906-NEXT: v_writelane_b32 v41, s16, 2 +; GFX906-NEXT: v_writelane_b32 v2, s7, 9 +; GFX906-NEXT: v_writelane_b32 v41, s30, 0 +; GFX906-NEXT: v_writelane_b32 v2, s4, 10 +; GFX906-NEXT: s_addk_i32 s32, 0x2800 +; GFX906-NEXT: v_writelane_b32 v41, s31, 1 +; GFX906-NEXT: 
v_mov_b32_e32 v32, v31 +; GFX906-NEXT: v_writelane_b32 v2, s5, 11 +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: v_mov_b32_e32 v33, v2 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def v[0:31] +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 
offset:80 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def v40 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s11 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: v_mov_b32_e32 v40, v33 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_writelane_b32 v40, s11, 12 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s12 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s12, 13 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s13 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s13, 14 +; GFX906-NEXT: ;;#ASMSTART +; 
GFX906-NEXT: ; def s14 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s14, 15 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s15 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s15, 16 +; GFX906-NEXT: s_getpc_b64 s[10:11] +; GFX906-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4 +; GFX906-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s16 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s16, 17 +; GFX906-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s17 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s17, 18 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s18 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s18, 19 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s19 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s19, 20 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s20 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_writelane_b32 v40, s20, 21 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: v_writelane_b32 v40, s10, 22 +; GFX906-NEXT: v_writelane_b32 v40, s11, 23 +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_readlane_b32 s16, v40, 22 +; GFX906-NEXT: s_mov_b32 s12, s24 +; GFX906-NEXT: s_mov_b32 s13, s23 +; GFX906-NEXT: s_mov_b32 s14, s22 +; GFX906-NEXT: v_mov_b32_e32 v31, v32 +; GFX906-NEXT: s_mov_b32 s15, s21 +; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX906-NEXT: v_readlane_b32 s17, v40, 23 +; GFX906-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_readlane_b32 s11, v40, 12 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s11 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s12, v40, 13 +; GFX906-NEXT: ;;#ASMSTART +; 
GFX906-NEXT: ; use s12 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s13, v40, 14 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s13 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s14, v40, 15 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s14 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s15, v40, 16 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s15 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s16, v40, 17 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s16 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s17, v40, 18 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s17 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s18, v40, 19 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s18 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s19, v40, 20 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s19 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s20, v40, 21 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s20 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s21 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s22 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s23 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s24 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s25 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s26 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s27 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s28 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; def s29 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: v_writelane_b32 v40, s21, 24 +; GFX906-NEXT: v_writelane_b32 v40, s22, 25 +; GFX906-NEXT: v_writelane_b32 v40, s23, 26 +; GFX906-NEXT: 
v_writelane_b32 v40, s24, 27 +; GFX906-NEXT: v_writelane_b32 v40, s25, 28 +; GFX906-NEXT: v_writelane_b32 v40, s26, 29 +; GFX906-NEXT: v_writelane_b32 v40, s27, 30 +; GFX906-NEXT: v_writelane_b32 v40, s28, 31 +; GFX906-NEXT: v_writelane_b32 v40, s29, 32 +; GFX906-NEXT: v_readlane_b32 s4, v40, 10 +; GFX906-NEXT: v_readlane_b32 s6, v40, 8 +; GFX906-NEXT: v_readlane_b32 s8, v40, 6 +; GFX906-NEXT: v_readlane_b32 s10, v40, 4 +; GFX906-NEXT: v_readlane_b32 s16, v40, 22 +; GFX906-NEXT: v_readlane_b32 s12, v40, 3 +; GFX906-NEXT: v_readlane_b32 s13, v40, 2 +; GFX906-NEXT: v_readlane_b32 s14, v40, 1 +; GFX906-NEXT: v_readlane_b32 s15, v40, 0 +; GFX906-NEXT: v_readlane_b32 s5, v40, 11 +; GFX906-NEXT: v_readlane_b32 s7, v40, 9 +; GFX906-NEXT: v_readlane_b32 s9, v40, 7 +; GFX906-NEXT: v_readlane_b32 s11, v40, 5 +; GFX906-NEXT: v_readlane_b32 s17, v40, 23 +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: v_readlane_b32 s21, v40, 24 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s21 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s22, v40, 25 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s22 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s23, v40, 26 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s23 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s24, v40, 27 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s24 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s25, v40, 28 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s25 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s26, v40, 29 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s26 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s27, v40, 30 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s27 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s28, v40, 31 +; 
GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s28 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: v_readlane_b32 s29, v40, 32 +; GFX906-NEXT: ;;#ASMSTART +; GFX906-NEXT: ; use s29 +; GFX906-NEXT: ;;#ASMEND +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: v_readlane_b32 s4, v40, 10 +; GFX906-NEXT: v_readlane_b32 s6, v40, 8 +; GFX906-NEXT: v_readlane_b32 s8, v40, 6 +; GFX906-NEXT: v_readlane_b32 s10, v40, 4 +; GFX906-NEXT: v_readlane_b32 s16, v40, 22 +; GFX906-NEXT: v_readlane_b32 s5, v40, 11 +; GFX906-NEXT: v_readlane_b32 s7, v40, 9 +; GFX906-NEXT: v_readlane_b32 s9, v40, 7 +; GFX906-NEXT: v_readlane_b32 s11, v40, 5 +; GFX906-NEXT: v_readlane_b32 s12, v40, 3 +; GFX906-NEXT: v_readlane_b32 s13, v40, 2 +; GFX906-NEXT: v_readlane_b32 s14, v40, 1 +; GFX906-NEXT: v_readlane_b32 s15, v40, 0 +; GFX906-NEXT: v_readlane_b32 s17, v40, 23 +; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX906-NEXT: s_mov_b64 exec, s[34:35] +; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; 
GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword 
v33, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: v_readlane_b32 s31, v41, 1 +; GFX906-NEXT: v_readlane_b32 s30, v41, 0 +; GFX906-NEXT: ; kill: killed $vgpr40 +; GFX906-NEXT: v_readlane_b32 s4, v41, 2 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX906-NEXT: s_mov_b64 exec, -1 +; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX906-NEXT: s_mov_b64 exec, s[6:7] +; GFX906-NEXT: s_addk_i32 s32, 0xd800 +; GFX906-NEXT: s_mov_b32 s33, s4 +; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX908-LABEL: preserve_wwm_copy_dstreg: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b32 s16, s33 +; GFX908-NEXT: s_mov_b32 s33, s32 +; GFX908-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GFX908-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill +; GFX908-NEXT: 
buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill +; GFX908-NEXT: s_mov_b64 exec, -1 +; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill +; GFX908-NEXT: s_mov_b64 exec, s[18:19] +; GFX908-NEXT: v_mov_b32_e32 v3, s16 +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill +; GFX908-NEXT: s_addk_i32 s32, 0x2c00 +; GFX908-NEXT: s_mov_b64 s[16:17], exec +; GFX908-NEXT: s_mov_b64 exec, 1 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 +; GFX908-NEXT: v_writelane_b32 v2, s30, 0 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_mov_b64 exec, s[16:17] +; GFX908-NEXT: s_mov_b64 s[16:17], exec +; GFX908-NEXT: s_mov_b64 exec, 1 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 +; GFX908-NEXT: v_writelane_b32 v2, s31, 0 +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_mov_b64 exec, s[16:17] +; GFX908-NEXT: s_mov_b32 s21, s15 +; GFX908-NEXT: ; implicit-def: $vgpr2 +; GFX908-NEXT: s_mov_b32 s22, s14 +; GFX908-NEXT: v_writelane_b32 v2, s21, 0 +; GFX908-NEXT: v_writelane_b32 v2, s22, 1 +; GFX908-NEXT: s_mov_b32 s23, s13 +; GFX908-NEXT: v_writelane_b32 v2, s23, 2 +; GFX908-NEXT: s_mov_b32 s24, s12 +; GFX908-NEXT: v_writelane_b32 v2, s24, 3 +; GFX908-NEXT: s_mov_b64 s[26:27], s[10:11] +; GFX908-NEXT: v_writelane_b32 v2, s26, 4 +; GFX908-NEXT: v_writelane_b32 v2, s27, 5 +; GFX908-NEXT: v_writelane_b32 v2, s8, 6 +; GFX908-NEXT: v_writelane_b32 v2, s9, 7 +; GFX908-NEXT: v_writelane_b32 v2, s6, 8 +; GFX908-NEXT: v_writelane_b32 v2, s7, 9 +; GFX908-NEXT: v_writelane_b32 v2, s4, 10 +; GFX908-NEXT: v_mov_b32_e32 v32, v31 +; GFX908-NEXT: v_writelane_b32 v2, s5, 11 
+; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: v_mov_b32_e32 v33, v2 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v[0:31] +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v17, 
off, s[0:3], s33 offset:88 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v40 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s11 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: v_mov_b32_e32 v40, v33 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_writelane_b32 v40, s11, 12 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s12 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s12, 13 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s13 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s13, 14 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s14 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: 
v_writelane_b32 v40, s14, 15 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s15 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s15, 16 +; GFX908-NEXT: s_getpc_b64 s[10:11] +; GFX908-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4 +; GFX908-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s16 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s16, 17 +; GFX908-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s17 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s17, 18 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s18 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s18, 19 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s19 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s19, 20 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s20 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_writelane_b32 v40, s20, 21 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_writelane_b32 v40, s10, 22 +; GFX908-NEXT: v_writelane_b32 v40, s11, 23 +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s16, v40, 22 +; GFX908-NEXT: s_mov_b32 s12, s24 +; GFX908-NEXT: s_mov_b32 s13, s23 +; GFX908-NEXT: s_mov_b32 s14, s22 +; GFX908-NEXT: v_mov_b32_e32 v31, v32 +; GFX908-NEXT: s_mov_b32 s15, s21 +; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] +; GFX908-NEXT: v_readlane_b32 s17, v40, 23 +; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s11, v40, 12 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s11 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s12, v40, 13 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s12 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: 
v_readlane_b32 s13, v40, 14 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s13 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s14, v40, 15 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s14 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s15, v40, 16 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s15 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s16, v40, 17 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s16 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s17, v40, 18 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s17 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s18, v40, 19 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s18 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s19, v40, 20 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s19 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s20, v40, 21 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s20 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s21 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s22 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s23 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s24 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s25 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s26 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s27 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s28 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def s29 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX908-NEXT: v_writelane_b32 v40, s21, 24 +; GFX908-NEXT: v_writelane_b32 v40, s22, 25 +; GFX908-NEXT: v_writelane_b32 v40, s23, 26 +; GFX908-NEXT: v_writelane_b32 v40, s24, 27 +; GFX908-NEXT: v_writelane_b32 
v40, s25, 28 +; GFX908-NEXT: v_writelane_b32 v40, s26, 29 +; GFX908-NEXT: v_writelane_b32 v40, s27, 30 +; GFX908-NEXT: v_writelane_b32 v40, s28, 31 +; GFX908-NEXT: v_writelane_b32 v40, s29, 32 +; GFX908-NEXT: v_readlane_b32 s4, v40, 10 +; GFX908-NEXT: v_readlane_b32 s6, v40, 8 +; GFX908-NEXT: v_readlane_b32 s8, v40, 6 +; GFX908-NEXT: v_readlane_b32 s10, v40, 4 +; GFX908-NEXT: v_readlane_b32 s16, v40, 22 +; GFX908-NEXT: v_readlane_b32 s12, v40, 3 +; GFX908-NEXT: v_readlane_b32 s13, v40, 2 +; GFX908-NEXT: v_readlane_b32 s14, v40, 1 +; GFX908-NEXT: v_readlane_b32 s15, v40, 0 +; GFX908-NEXT: v_readlane_b32 s5, v40, 11 +; GFX908-NEXT: v_readlane_b32 s7, v40, 9 +; GFX908-NEXT: v_readlane_b32 s9, v40, 7 +; GFX908-NEXT: v_readlane_b32 s11, v40, 5 +; GFX908-NEXT: v_readlane_b32 s17, v40, 23 +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: v_readlane_b32 s21, v40, 24 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s21 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s22, v40, 25 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s22 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s23, v40, 26 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s23 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s24, v40, 27 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s24 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s25, v40, 28 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s25 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s26, v40, 29 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s26 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s27, v40, 30 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s27 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s28, v40, 31 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s28 +; GFX908-NEXT: 
;;#ASMEND +; GFX908-NEXT: v_readlane_b32 s29, v40, 32 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use s29 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX908-NEXT: v_readlane_b32 s4, v40, 10 +; GFX908-NEXT: v_readlane_b32 s6, v40, 8 +; GFX908-NEXT: v_readlane_b32 s8, v40, 6 +; GFX908-NEXT: v_readlane_b32 s10, v40, 4 +; GFX908-NEXT: v_readlane_b32 s16, v40, 22 +; GFX908-NEXT: v_readlane_b32 s5, v40, 11 +; GFX908-NEXT: v_readlane_b32 s7, v40, 9 +; GFX908-NEXT: v_readlane_b32 s9, v40, 7 +; GFX908-NEXT: v_readlane_b32 s11, v40, 5 +; GFX908-NEXT: v_readlane_b32 s12, v40, 3 +; GFX908-NEXT: v_readlane_b32 s13, v40, 2 +; GFX908-NEXT: v_readlane_b32 s14, v40, 1 +; GFX908-NEXT: v_readlane_b32 s15, v40, 0 +; GFX908-NEXT: v_readlane_b32 s17, v40, 23 +; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX908-NEXT: s_mov_b64 exec, s[34:35] +; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 
4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; 
GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_mov_b64 exec, 1 +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readlane_b32 s31, v0, 0 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_mov_b64 exec, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: s_mov_b64 exec, 1 +; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readlane_b32 s30, v0, 0 +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_mov_b64 exec, s[4:5] +; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload +; GFX908-NEXT: ; kill: killed $vgpr40 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_readfirstlane_b32 s4, v0 +; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 
4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b64 exec, -1 +; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b64 exec, s[6:7] +; GFX908-NEXT: s_addk_i32 s32, 0xd400 +; GFX908-NEXT: s_mov_b32 s33, s4 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] + %vreg0 = call <32 x float> asm sideeffect "; def $0", "=v"() + %v40 = call i32 asm sideeffect "; def $0","=${v40}"() + + %s11 = call i32 asm sideeffect "; def $0","=${s11}"() + %s12 = call i32 asm sideeffect "; def $0","=${s12}"() + %s13 = call i32 asm sideeffect "; def $0","=${s13}"() + %s14 = call i32 asm sideeffect "; def $0","=${s14}"() + %s15 = call i32 asm sideeffect "; def $0","=${s15}"() + %s16 = call i32 asm sideeffect "; def $0","=${s16}"() + %s17 = call i32 asm sideeffect "; def $0","=${s17}"() + %s18 = call i32 asm sideeffect "; def $0","=${s18}"() + %s19 = call i32 asm sideeffect "; def $0","=${s19}"() + %s20 = call i32 asm sideeffect "; def $0","=${s20}"() + call void @foo() + call void asm sideeffect "; use $0","${s11}"(i32 %s11) + call void asm sideeffect "; use $0","${s12}"(i32 %s12) + call void asm sideeffect "; use $0","${s13}"(i32 %s13) + call void asm sideeffect "; use $0","${s14}"(i32 %s14) + call void asm sideeffect "; use $0","${s15}"(i32 %s15) + call void asm sideeffect "; use $0","${s16}"(i32 %s16) + call void asm sideeffect "; use $0","${s17}"(i32 %s17) + call void asm sideeffect "; use $0","${s18}"(i32 %s18) + call void asm sideeffect "; use $0","${s19}"(i32 %s19) + call void asm sideeffect "; use $0","${s20}"(i32 %s20) + + %s21 = call i32 asm sideeffect "; def $0","=${s21}"() + %s22 = call i32 asm sideeffect "; def $0","=${s22}"() + %s23 = call i32 asm sideeffect "; def $0","=${s23}"() + %s24 = call i32 asm sideeffect "; def $0","=${s24}"() + %s25 = call i32 asm sideeffect "; def $0","=${s25}"() + %s26 = call i32 
asm sideeffect "; def $0","=${s26}"() + %s27 = call i32 asm sideeffect "; def $0","=${s27}"() + %s28 = call i32 asm sideeffect "; def $0","=${s28}"() + %s29 = call i32 asm sideeffect "; def $0","=${s29}"() + call void @foo() + call void asm sideeffect "; use $0","${s21}"(i32 %s21) + call void asm sideeffect "; use $0","${s22}"(i32 %s22) + call void asm sideeffect "; use $0","${s23}"(i32 %s23) + call void asm sideeffect "; use $0","${s24}"(i32 %s24) + call void asm sideeffect "; use $0","${s25}"(i32 %s25) + call void asm sideeffect "; use $0","${s26}"(i32 %s26) + call void asm sideeffect "; use $0","${s27}"(i32 %s27) + call void asm sideeffect "; use $0","${s28}"(i32 %s28) + call void asm sideeffect "; use $0","${s29}"(i32 %s29) + + call void @foo() + + store volatile <32 x float> %vreg0, ptr %parg0 + + ret void +} + +declare void @foo() + +attributes #0 = { "amdgpu-num-vgpr"="42" "amdgpu-num-sgpr"="40"} diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -1,22 +1,381 @@ -; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -; This ends up needing to spill SGPRs to memory, and also does not -; have any free SGPRs available to save the exec mask when doing so. -; The register scavenger also needs to use the emergency stack slot, -; which tries to place the scavenged register restore instruction as -; far the block as possible, near the terminator. 
This places a -; restore instruction between the condition and the conditional -; branch, which gets expanded into a sequence involving s_not_b64 on -; the exec mask, clobbering SCC value before the branch. We probably -; have to stop relying on being able to flip and restore the exec -; mask, and always require a free SGPR for saving exec. +; This was a negative test to catch an extreme case when all options are exhausted +; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs +; the edge case won't arise and the test would always compile. -; CHECK: *** Bad machine code: Using an undefined physical register *** -; CHECK-NEXT: - function: kernel0 -; CHECK-NEXT: - basic block: %bb.0 -; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc -; CHECK-NEXT: - operand 1: implicit killed $scc define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { +; CHECK-LABEL: kernel0: +; CHECK: ; %bb.0: +; CHECK-NEXT: ; implicit-def: $vgpr23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s2, 0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: v_writelane_b32 v23, s3, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s4, 2 +; CHECK-NEXT: v_writelane_b32 v23, s5, 3 +; CHECK-NEXT: v_writelane_b32 v23, s6, 4 +; CHECK-NEXT: v_writelane_b32 v23, s7, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s4, 6 +; CHECK-NEXT: v_writelane_b32 v23, s5, 7 +; CHECK-NEXT: v_writelane_b32 v23, s6, 8 +; CHECK-NEXT: v_writelane_b32 v23, s7, 9 +; CHECK-NEXT: v_writelane_b32 
v23, s8, 10 +; CHECK-NEXT: v_writelane_b32 v23, s9, 11 +; CHECK-NEXT: v_writelane_b32 v23, s10, 12 +; CHECK-NEXT: v_writelane_b32 v23, s11, 13 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:19] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s4, 14 +; CHECK-NEXT: v_writelane_b32 v23, s5, 15 +; CHECK-NEXT: v_writelane_b32 v23, s6, 16 +; CHECK-NEXT: v_writelane_b32 v23, s7, 17 +; CHECK-NEXT: v_writelane_b32 v23, s8, 18 +; CHECK-NEXT: v_writelane_b32 v23, s9, 19 +; CHECK-NEXT: v_writelane_b32 v23, s10, 20 +; CHECK-NEXT: v_writelane_b32 v23, s11, 21 +; CHECK-NEXT: v_writelane_b32 v23, s12, 22 +; CHECK-NEXT: v_writelane_b32 v23, s13, 23 +; CHECK-NEXT: v_writelane_b32 v23, s14, 24 +; CHECK-NEXT: v_writelane_b32 v23, s15, 25 +; CHECK-NEXT: v_writelane_b32 v23, s16, 26 +; CHECK-NEXT: v_writelane_b32 v23, s17, 27 +; CHECK-NEXT: v_writelane_b32 v23, s18, 28 +; CHECK-NEXT: v_writelane_b32 v23, s19, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s2, 30 +; CHECK-NEXT: v_writelane_b32 v23, s3, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s4, 32 +; CHECK-NEXT: v_writelane_b32 v23, s5, 33 +; CHECK-NEXT: v_writelane_b32 v23, s6, 34 +; CHECK-NEXT: v_writelane_b32 v23, s7, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s4, 36 +; CHECK-NEXT: v_writelane_b32 v23, s5, 37 +; CHECK-NEXT: v_writelane_b32 v23, s6, 38 +; CHECK-NEXT: v_writelane_b32 v23, s7, 39 +; CHECK-NEXT: v_writelane_b32 v23, s8, 40 +; CHECK-NEXT: v_writelane_b32 v23, s9, 41 +; CHECK-NEXT: v_writelane_b32 v23, s10, 42 +; CHECK-NEXT: v_writelane_b32 v23, s11, 43 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[16:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: 
;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[36:43] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 44 +; CHECK-NEXT: v_writelane_b32 v23, s1, 45 +; CHECK-NEXT: v_writelane_b32 v23, s2, 46 +; CHECK-NEXT: v_writelane_b32 v23, s3, 47 +; CHECK-NEXT: v_writelane_b32 v23, s4, 48 +; CHECK-NEXT: v_writelane_b32 v23, s5, 49 +; CHECK-NEXT: v_writelane_b32 v23, s6, 50 +; CHECK-NEXT: v_writelane_b32 v23, s7, 51 +; CHECK-NEXT: v_writelane_b32 v23, s8, 52 +; CHECK-NEXT: v_writelane_b32 v23, s9, 53 +; CHECK-NEXT: v_writelane_b32 v23, s10, 54 +; CHECK-NEXT: v_writelane_b32 v23, s11, 55 +; CHECK-NEXT: v_writelane_b32 v23, s12, 56 +; CHECK-NEXT: v_writelane_b32 v23, s13, 57 +; CHECK-NEXT: v_writelane_b32 v23, s14, 58 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v23, s15, 59 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[34:35] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 60 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v23, s1, 61 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: v_writelane_b32 v23, s2, 62 +; CHECK-NEXT: v_writelane_b32 v0, s6, 2 +; CHECK-NEXT: v_writelane_b32 v23, s3, 63 +; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 4 +; CHECK-NEXT: v_writelane_b32 v0, s1, 5 +; CHECK-NEXT: v_writelane_b32 v0, s2, 6 +; CHECK-NEXT: v_writelane_b32 v0, s3, 7 +; CHECK-NEXT: v_writelane_b32 v0, s4, 8 +; CHECK-NEXT: v_writelane_b32 v0, s5, 9 +; CHECK-NEXT: v_writelane_b32 v0, s6, 10 +; CHECK-NEXT: v_writelane_b32 v0, s7, 11 +; CHECK-NEXT: 
v_writelane_b32 v0, s8, 12 +; CHECK-NEXT: v_writelane_b32 v0, s9, 13 +; CHECK-NEXT: v_writelane_b32 v0, s10, 14 +; CHECK-NEXT: v_writelane_b32 v0, s11, 15 +; CHECK-NEXT: v_writelane_b32 v0, s12, 16 +; CHECK-NEXT: v_writelane_b32 v0, s13, 17 +; CHECK-NEXT: v_writelane_b32 v0, s14, 18 +; CHECK-NEXT: v_writelane_b32 v0, s15, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[54:55] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 20 +; CHECK-NEXT: v_writelane_b32 v0, s1, 21 +; CHECK-NEXT: v_writelane_b32 v0, s2, 22 +; CHECK-NEXT: v_writelane_b32 v0, s3, 23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 24 +; CHECK-NEXT: v_writelane_b32 v0, s1, 25 +; CHECK-NEXT: v_writelane_b32 v0, s2, 26 +; CHECK-NEXT: v_writelane_b32 v0, s3, 27 +; CHECK-NEXT: v_writelane_b32 v0, s4, 28 +; CHECK-NEXT: v_writelane_b32 v0, s5, 29 +; CHECK-NEXT: v_writelane_b32 v0, s6, 30 +; CHECK-NEXT: v_writelane_b32 v0, s7, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 32 +; CHECK-NEXT: v_writelane_b32 v0, s1, 33 +; CHECK-NEXT: v_writelane_b32 v0, s2, 34 +; CHECK-NEXT: v_writelane_b32 v0, s3, 35 +; CHECK-NEXT: v_writelane_b32 v0, s4, 36 +; CHECK-NEXT: v_writelane_b32 v0, s5, 37 +; CHECK-NEXT: v_writelane_b32 v0, s6, 38 +; CHECK-NEXT: v_writelane_b32 v0, s7, 39 +; CHECK-NEXT: v_writelane_b32 v0, s8, 40 +; CHECK-NEXT: v_writelane_b32 v0, s9, 41 +; CHECK-NEXT: v_writelane_b32 v0, s10, 42 +; CHECK-NEXT: v_writelane_b32 v0, s11, 43 +; CHECK-NEXT: v_writelane_b32 v0, s12, 44 +; CHECK-NEXT: v_writelane_b32 v0, s13, 45 +; CHECK-NEXT: v_writelane_b32 v0, s14, 46 +; CHECK-NEXT: v_writelane_b32 v0, s15, 47 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %ret +; CHECK-NEXT: ; kill: killed $vgpr23 +; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: s_endpgm 
+; CHECK-NEXT: .LBB0_2: ; %bb0 +; CHECK-NEXT: v_readlane_b32 s0, v23, 0 +; CHECK-NEXT: v_readlane_b32 s1, v23, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 2 +; CHECK-NEXT: v_readlane_b32 s1, v23, 3 +; CHECK-NEXT: v_readlane_b32 s2, v23, 4 +; CHECK-NEXT: v_readlane_b32 s3, v23, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 6 +; CHECK-NEXT: v_readlane_b32 s1, v23, 7 +; CHECK-NEXT: v_readlane_b32 s2, v23, 8 +; CHECK-NEXT: v_readlane_b32 s3, v23, 9 +; CHECK-NEXT: v_readlane_b32 s4, v23, 10 +; CHECK-NEXT: v_readlane_b32 s5, v23, 11 +; CHECK-NEXT: v_readlane_b32 s6, v23, 12 +; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 14 +; CHECK-NEXT: v_readlane_b32 s1, v23, 15 +; CHECK-NEXT: v_readlane_b32 s2, v23, 16 +; CHECK-NEXT: v_readlane_b32 s3, v23, 17 +; CHECK-NEXT: v_readlane_b32 s4, v23, 18 +; CHECK-NEXT: v_readlane_b32 s5, v23, 19 +; CHECK-NEXT: v_readlane_b32 s6, v23, 20 +; CHECK-NEXT: v_readlane_b32 s7, v23, 21 +; CHECK-NEXT: v_readlane_b32 s8, v23, 22 +; CHECK-NEXT: v_readlane_b32 s9, v23, 23 +; CHECK-NEXT: v_readlane_b32 s10, v23, 24 +; CHECK-NEXT: v_readlane_b32 s11, v23, 25 +; CHECK-NEXT: v_readlane_b32 s12, v23, 26 +; CHECK-NEXT: v_readlane_b32 s13, v23, 27 +; CHECK-NEXT: v_readlane_b32 s14, v23, 28 +; CHECK-NEXT: v_readlane_b32 s15, v23, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 30 +; CHECK-NEXT: v_readlane_b32 s1, v23, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 32 +; CHECK-NEXT: v_readlane_b32 s1, v23, 33 +; CHECK-NEXT: v_readlane_b32 s2, v23, 34 +; CHECK-NEXT: v_readlane_b32 s3, v23, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; 
CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 36 +; CHECK-NEXT: v_readlane_b32 s1, v23, 37 +; CHECK-NEXT: v_readlane_b32 s2, v23, 38 +; CHECK-NEXT: v_readlane_b32 s3, v23, 39 +; CHECK-NEXT: v_readlane_b32 s4, v23, 40 +; CHECK-NEXT: v_readlane_b32 s5, v23, 41 +; CHECK-NEXT: v_readlane_b32 s6, v23, 42 +; CHECK-NEXT: v_readlane_b32 s7, v23, 43 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 44 +; CHECK-NEXT: v_readlane_b32 s1, v23, 45 +; CHECK-NEXT: v_readlane_b32 s2, v23, 46 +; CHECK-NEXT: v_readlane_b32 s3, v23, 47 +; CHECK-NEXT: v_readlane_b32 s4, v23, 48 +; CHECK-NEXT: v_readlane_b32 s5, v23, 49 +; CHECK-NEXT: v_readlane_b32 s6, v23, 50 +; CHECK-NEXT: v_readlane_b32 s7, v23, 51 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[16:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[36:43] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s8, v23, 52 +; CHECK-NEXT: v_readlane_b32 s9, v23, 53 +; CHECK-NEXT: v_readlane_b32 s10, v23, 54 +; CHECK-NEXT: v_readlane_b32 s11, v23, 55 +; CHECK-NEXT: v_readlane_b32 s12, v23, 56 +; CHECK-NEXT: v_readlane_b32 s13, v23, 57 +; CHECK-NEXT: v_readlane_b32 s14, v23, 58 +; CHECK-NEXT: v_readlane_b32 s15, v23, 59 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 60 +; CHECK-NEXT: v_readlane_b32 s1, v23, 61 +; CHECK-NEXT: v_readlane_b32 s2, v23, 62 +; CHECK-NEXT: v_readlane_b32 s3, v23, 63 +; CHECK-NEXT: v_readlane_b32 s4, v0, 0 +; CHECK-NEXT: v_readlane_b32 s5, v0, 1 +; CHECK-NEXT: v_readlane_b32 s6, v0, 2 +; CHECK-NEXT: v_readlane_b32 s7, v0, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[34:35] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[44:47] +; 
CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 4 +; CHECK-NEXT: v_readlane_b32 s1, v0, 5 +; CHECK-NEXT: v_readlane_b32 s2, v0, 6 +; CHECK-NEXT: v_readlane_b32 s3, v0, 7 +; CHECK-NEXT: v_readlane_b32 s4, v0, 8 +; CHECK-NEXT: v_readlane_b32 s5, v0, 9 +; CHECK-NEXT: v_readlane_b32 s6, v0, 10 +; CHECK-NEXT: v_readlane_b32 s7, v0, 11 +; CHECK-NEXT: v_readlane_b32 s8, v0, 12 +; CHECK-NEXT: v_readlane_b32 s9, v0, 13 +; CHECK-NEXT: v_readlane_b32 s10, v0, 14 +; CHECK-NEXT: v_readlane_b32 s11, v0, 15 +; CHECK-NEXT: v_readlane_b32 s12, v0, 16 +; CHECK-NEXT: v_readlane_b32 s13, v0, 17 +; CHECK-NEXT: v_readlane_b32 s14, v0, 18 +; CHECK-NEXT: v_readlane_b32 s15, v0, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 20 +; CHECK-NEXT: v_readlane_b32 s1, v0, 21 +; CHECK-NEXT: v_readlane_b32 s2, v0, 22 +; CHECK-NEXT: v_readlane_b32 s3, v0, 23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[54:55] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 24 +; CHECK-NEXT: v_readlane_b32 s1, v0, 25 +; CHECK-NEXT: v_readlane_b32 s2, v0, 26 +; CHECK-NEXT: v_readlane_b32 s3, v0, 27 +; CHECK-NEXT: v_readlane_b32 s4, v0, 28 +; CHECK-NEXT: v_readlane_b32 s5, v0, 29 +; CHECK-NEXT: v_readlane_b32 s6, v0, 30 +; CHECK-NEXT: v_readlane_b32 s7, v0, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 32 +; CHECK-NEXT: v_readlane_b32 s1, v0, 33 +; CHECK-NEXT: v_readlane_b32 s2, v0, 34 +; CHECK-NEXT: v_readlane_b32 s3, v0, 35 +; CHECK-NEXT: v_readlane_b32 s4, v0, 36 +; CHECK-NEXT: v_readlane_b32 s5, v0, 37 +; CHECK-NEXT: v_readlane_b32 s6, v0, 38 +; CHECK-NEXT: v_readlane_b32 s7, v0, 39 +; CHECK-NEXT: v_readlane_b32 s8, v0, 40 +; CHECK-NEXT: v_readlane_b32 s9, v0, 41 +; CHECK-NEXT: 
v_readlane_b32 s10, v0, 42 +; CHECK-NEXT: v_readlane_b32 s11, v0, 43 +; CHECK-NEXT: v_readlane_b32 s12, v0, 44 +; CHECK-NEXT: v_readlane_b32 s13, v0, 45 +; CHECK-NEXT: v_readlane_b32 s14, v0, 46 +; CHECK-NEXT: v_readlane_b32 s15, v0, 47 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; kill: killed $vgpr23 +; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,4 +1,5 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0. # Otherwise, the test would crash during PEI while trying to replace the dead frame index. 
@@ -39,13 +40,33 @@ workGroupIDX: { reg: '$sgpr8' } privateSegmentWaveByteOffset: { reg: '$sgpr9' } body: | - ; CHECK-LABEL: name: test - ; CHECK: bb.0: - ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0 - ; CHECK: DBG_VALUE $noreg, 0 - ; CHECK: bb.1: - ; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0 - ; CHECK: S_ENDPGM 0 + ; SGPR_SPILL-LABEL: name: test + ; SGPR_SPILL: bb.0: + ; SGPR_SPILL-NEXT: successors: %bb.1(0x80000000) + ; SGPR_SPILL-NEXT: {{ $}} + ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF + ; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0 + ; SGPR_SPILL-NEXT: {{ $}} + ; SGPR_SPILL-NEXT: bb.1: + ; SGPR_SPILL-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 + ; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]] + ; SGPR_SPILL-NEXT: S_ENDPGM 0 + ; PEI-LABEL: name: test + ; PEI: bb.0: + ; PEI-NEXT: successors: %bb.1(0x80000000) + ; PEI-NEXT: {{ $}} + ; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF + ; PEI-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, killed $vgpr0 + ; PEI-NEXT: {{ $}} + ; PEI-NEXT: bb.1: + ; PEI-NEXT: liveins: $vgpr0 + ; PEI-NEXT: {{ $}} + ; PEI-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 0 + ; PEI-NEXT: KILL killed renamable $vgpr0 + ; PEI-NEXT: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 
-amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0. # Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise. @@ -45,7 +45,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0: - ; CHECK: DBG_VALUE $noreg, 0 + ; CHECK: DBG_VALUE bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -9,9 +9,17 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -23,179 +31,179 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 0 -; GCN-NEXT: v_writelane_b32 v23, s9, 1 -; GCN-NEXT: v_writelane_b32 v23, s10, 2 -; GCN-NEXT: v_writelane_b32 v23, s11, 3 -; GCN-NEXT: v_writelane_b32 v23, 
s12, 4 -; GCN-NEXT: v_writelane_b32 v23, s13, 5 -; GCN-NEXT: v_writelane_b32 v23, s14, 6 -; GCN-NEXT: v_writelane_b32 v23, s15, 7 -; GCN-NEXT: v_writelane_b32 v23, s16, 8 -; GCN-NEXT: v_writelane_b32 v23, s17, 9 -; GCN-NEXT: v_writelane_b32 v23, s18, 10 -; GCN-NEXT: v_writelane_b32 v23, s19, 11 -; GCN-NEXT: v_writelane_b32 v23, s20, 12 -; GCN-NEXT: v_writelane_b32 v23, s21, 13 -; GCN-NEXT: v_writelane_b32 v23, s22, 14 -; GCN-NEXT: v_writelane_b32 v23, s23, 15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_writelane_b32 v1, s8, 0 +; GCN-NEXT: v_writelane_b32 v1, s9, 1 +; GCN-NEXT: v_writelane_b32 v1, s10, 2 +; GCN-NEXT: v_writelane_b32 v1, s11, 3 +; GCN-NEXT: v_writelane_b32 v1, s12, 4 +; GCN-NEXT: v_writelane_b32 v1, s13, 5 +; GCN-NEXT: v_writelane_b32 v1, s14, 6 +; GCN-NEXT: v_writelane_b32 v1, s15, 7 +; GCN-NEXT: v_writelane_b32 v1, s16, 8 +; GCN-NEXT: v_writelane_b32 v1, s17, 9 +; GCN-NEXT: v_writelane_b32 v1, s18, 10 +; GCN-NEXT: v_writelane_b32 v1, s19, 11 +; GCN-NEXT: v_writelane_b32 v1, s20, 12 +; GCN-NEXT: v_writelane_b32 v1, s21, 13 +; GCN-NEXT: v_writelane_b32 v1, s22, 14 +; GCN-NEXT: v_writelane_b32 v1, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 16 -; GCN-NEXT: v_writelane_b32 v23, s9, 17 -; GCN-NEXT: v_writelane_b32 v23, s10, 18 -; GCN-NEXT: v_writelane_b32 v23, s11, 19 -; GCN-NEXT: v_writelane_b32 v23, s12, 20 -; GCN-NEXT: v_writelane_b32 v23, s13, 21 -; GCN-NEXT: v_writelane_b32 v23, s14, 22 -; GCN-NEXT: v_writelane_b32 v23, s15, 23 -; GCN-NEXT: v_writelane_b32 v23, s16, 24 -; GCN-NEXT: v_writelane_b32 v23, s17, 25 -; GCN-NEXT: v_writelane_b32 v23, s18, 26 -; GCN-NEXT: v_writelane_b32 v23, s19, 27 -; GCN-NEXT: v_writelane_b32 v23, s20, 28 -; GCN-NEXT: v_writelane_b32 v23, s21, 29 -; GCN-NEXT: v_writelane_b32 v23, s22, 30 -; GCN-NEXT: v_writelane_b32 v23, s23, 31 +; GCN-NEXT: v_writelane_b32 v1, s8, 16 +; GCN-NEXT: v_writelane_b32 v1, s9, 17 +; GCN-NEXT: v_writelane_b32 v1, s10, 
18 +; GCN-NEXT: v_writelane_b32 v1, s11, 19 +; GCN-NEXT: v_writelane_b32 v1, s12, 20 +; GCN-NEXT: v_writelane_b32 v1, s13, 21 +; GCN-NEXT: v_writelane_b32 v1, s14, 22 +; GCN-NEXT: v_writelane_b32 v1, s15, 23 +; GCN-NEXT: v_writelane_b32 v1, s16, 24 +; GCN-NEXT: v_writelane_b32 v1, s17, 25 +; GCN-NEXT: v_writelane_b32 v1, s18, 26 +; GCN-NEXT: v_writelane_b32 v1, s19, 27 +; GCN-NEXT: v_writelane_b32 v1, s20, 28 +; GCN-NEXT: v_writelane_b32 v1, s21, 29 +; GCN-NEXT: v_writelane_b32 v1, s22, 30 +; GCN-NEXT: v_writelane_b32 v1, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 32 -; GCN-NEXT: v_writelane_b32 v23, s9, 33 -; GCN-NEXT: v_writelane_b32 v23, s10, 34 -; GCN-NEXT: v_writelane_b32 v23, s11, 35 -; GCN-NEXT: v_writelane_b32 v23, s12, 36 -; GCN-NEXT: v_writelane_b32 v23, s13, 37 -; GCN-NEXT: v_writelane_b32 v23, s14, 38 -; GCN-NEXT: v_writelane_b32 v23, s15, 39 -; GCN-NEXT: v_writelane_b32 v23, s16, 40 -; GCN-NEXT: v_writelane_b32 v23, s17, 41 -; GCN-NEXT: v_writelane_b32 v23, s18, 42 -; GCN-NEXT: v_writelane_b32 v23, s19, 43 -; GCN-NEXT: v_writelane_b32 v23, s20, 44 -; GCN-NEXT: v_writelane_b32 v23, s21, 45 -; GCN-NEXT: v_writelane_b32 v23, s22, 46 -; GCN-NEXT: v_writelane_b32 v23, s23, 47 +; GCN-NEXT: v_writelane_b32 v1, s8, 32 +; GCN-NEXT: v_writelane_b32 v1, s9, 33 +; GCN-NEXT: v_writelane_b32 v1, s10, 34 +; GCN-NEXT: v_writelane_b32 v1, s11, 35 +; GCN-NEXT: v_writelane_b32 v1, s12, 36 +; GCN-NEXT: v_writelane_b32 v1, s13, 37 +; GCN-NEXT: v_writelane_b32 v1, s14, 38 +; GCN-NEXT: v_writelane_b32 v1, s15, 39 +; GCN-NEXT: v_writelane_b32 v1, s16, 40 +; GCN-NEXT: v_writelane_b32 v1, s17, 41 +; GCN-NEXT: v_writelane_b32 v1, s18, 42 +; GCN-NEXT: v_writelane_b32 v1, s19, 43 +; GCN-NEXT: v_writelane_b32 v1, s20, 44 +; GCN-NEXT: v_writelane_b32 v1, s21, 45 +; GCN-NEXT: v_writelane_b32 v1, s22, 46 +; GCN-NEXT: v_writelane_b32 v1, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: 
;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 48 -; GCN-NEXT: v_writelane_b32 v23, s9, 49 -; GCN-NEXT: v_writelane_b32 v23, s10, 50 -; GCN-NEXT: v_writelane_b32 v23, s11, 51 -; GCN-NEXT: v_writelane_b32 v23, s12, 52 -; GCN-NEXT: v_writelane_b32 v23, s13, 53 -; GCN-NEXT: v_writelane_b32 v23, s14, 54 -; GCN-NEXT: v_writelane_b32 v23, s15, 55 -; GCN-NEXT: v_writelane_b32 v23, s16, 56 -; GCN-NEXT: v_writelane_b32 v23, s17, 57 -; GCN-NEXT: v_writelane_b32 v23, s18, 58 -; GCN-NEXT: v_writelane_b32 v23, s19, 59 -; GCN-NEXT: v_writelane_b32 v23, s20, 60 -; GCN-NEXT: v_writelane_b32 v23, s21, 61 -; GCN-NEXT: v_writelane_b32 v23, s22, 62 -; GCN-NEXT: v_writelane_b32 v23, s23, 63 +; GCN-NEXT: v_writelane_b32 v1, s8, 48 +; GCN-NEXT: v_writelane_b32 v1, s9, 49 +; GCN-NEXT: v_writelane_b32 v1, s10, 50 +; GCN-NEXT: v_writelane_b32 v1, s11, 51 +; GCN-NEXT: v_writelane_b32 v1, s12, 52 +; GCN-NEXT: v_writelane_b32 v1, s13, 53 +; GCN-NEXT: v_writelane_b32 v1, s14, 54 +; GCN-NEXT: v_writelane_b32 v1, s15, 55 +; GCN-NEXT: v_writelane_b32 v1, s16, 56 +; GCN-NEXT: v_writelane_b32 v1, s17, 57 +; GCN-NEXT: v_writelane_b32 v1, s18, 58 +; GCN-NEXT: v_writelane_b32 v1, s19, 59 +; GCN-NEXT: v_writelane_b32 v1, s20, 60 +; GCN-NEXT: v_writelane_b32 v1, s21, 61 +; GCN-NEXT: v_writelane_b32 v1, s22, 62 +; GCN-NEXT: v_writelane_b32 v1, s23, 63 +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[8:9], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, s5 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s4, v23, 0 -; GCN-NEXT: v_readlane_b32 s5, v23, 1 -; GCN-NEXT: v_readlane_b32 s6, v23, 2 -; GCN-NEXT: v_readlane_b32 s7, v23, 3 -; GCN-NEXT: v_readlane_b32 s8, v23, 4 -; GCN-NEXT: v_readlane_b32 s9, v23, 5 -; GCN-NEXT: v_readlane_b32 s10, v23, 6 -; GCN-NEXT: v_readlane_b32 s11, v23, 7 -; GCN-NEXT: v_readlane_b32 s12, v23, 8 -; GCN-NEXT: v_readlane_b32 s13, v23, 9 -; GCN-NEXT: v_readlane_b32 s14, v23, 10 -; GCN-NEXT: v_readlane_b32 s15, v23, 11 -; GCN-NEXT: v_readlane_b32 s16, v23, 12 -; GCN-NEXT: v_readlane_b32 s17, v23, 13 -; GCN-NEXT: v_readlane_b32 s18, v23, 14 -; GCN-NEXT: v_readlane_b32 s19, v23, 15 +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-NEXT: v_readlane_b32 s6, v1, 2 +; GCN-NEXT: v_readlane_b32 s7, v1, 3 +; GCN-NEXT: v_readlane_b32 s8, v1, 4 +; GCN-NEXT: v_readlane_b32 s9, v1, 5 +; GCN-NEXT: v_readlane_b32 s10, v1, 6 +; GCN-NEXT: v_readlane_b32 s11, v1, 7 +; GCN-NEXT: v_readlane_b32 s12, v1, 8 +; GCN-NEXT: v_readlane_b32 s13, v1, 9 +; GCN-NEXT: v_readlane_b32 s14, v1, 10 +; GCN-NEXT: v_readlane_b32 s15, v1, 11 +; GCN-NEXT: v_readlane_b32 s16, v1, 12 +; GCN-NEXT: v_readlane_b32 s17, v1, 13 +; GCN-NEXT: v_readlane_b32 s18, v1, 14 +; GCN-NEXT: v_readlane_b32 s19, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v23, 16 -; GCN-NEXT: 
v_readlane_b32 s5, v23, 17 -; GCN-NEXT: v_readlane_b32 s6, v23, 18 -; GCN-NEXT: v_readlane_b32 s7, v23, 19 -; GCN-NEXT: v_readlane_b32 s8, v23, 20 -; GCN-NEXT: v_readlane_b32 s9, v23, 21 -; GCN-NEXT: v_readlane_b32 s10, v23, 22 -; GCN-NEXT: v_readlane_b32 s11, v23, 23 -; GCN-NEXT: v_readlane_b32 s12, v23, 24 -; GCN-NEXT: v_readlane_b32 s13, v23, 25 -; GCN-NEXT: v_readlane_b32 s14, v23, 26 -; GCN-NEXT: v_readlane_b32 s15, v23, 27 -; GCN-NEXT: v_readlane_b32 s16, v23, 28 -; GCN-NEXT: v_readlane_b32 s17, v23, 29 -; GCN-NEXT: v_readlane_b32 s18, v23, 30 -; GCN-NEXT: v_readlane_b32 s19, v23, 31 +; GCN-NEXT: v_readlane_b32 s4, v1, 16 +; GCN-NEXT: v_readlane_b32 s5, v1, 17 +; GCN-NEXT: v_readlane_b32 s6, v1, 18 +; GCN-NEXT: v_readlane_b32 s7, v1, 19 +; GCN-NEXT: v_readlane_b32 s8, v1, 20 +; GCN-NEXT: v_readlane_b32 s9, v1, 21 +; GCN-NEXT: v_readlane_b32 s10, v1, 22 +; GCN-NEXT: v_readlane_b32 s11, v1, 23 +; GCN-NEXT: v_readlane_b32 s12, v1, 24 +; GCN-NEXT: v_readlane_b32 s13, v1, 25 +; GCN-NEXT: v_readlane_b32 s14, v1, 26 +; GCN-NEXT: v_readlane_b32 s15, v1, 27 +; GCN-NEXT: v_readlane_b32 s16, v1, 28 +; GCN-NEXT: v_readlane_b32 s17, v1, 29 +; GCN-NEXT: v_readlane_b32 s18, v1, 30 +; GCN-NEXT: v_readlane_b32 s19, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v23, 32 -; GCN-NEXT: v_readlane_b32 s5, v23, 33 -; GCN-NEXT: v_readlane_b32 s6, v23, 34 -; GCN-NEXT: v_readlane_b32 s7, v23, 35 -; GCN-NEXT: v_readlane_b32 s8, v23, 36 -; GCN-NEXT: v_readlane_b32 s9, v23, 37 -; GCN-NEXT: v_readlane_b32 s10, v23, 38 -; GCN-NEXT: v_readlane_b32 s11, v23, 39 -; GCN-NEXT: v_readlane_b32 s12, v23, 40 -; GCN-NEXT: v_readlane_b32 s13, v23, 41 -; GCN-NEXT: v_readlane_b32 s14, v23, 42 -; GCN-NEXT: v_readlane_b32 s15, v23, 43 -; GCN-NEXT: v_readlane_b32 s16, v23, 44 -; GCN-NEXT: v_readlane_b32 s17, v23, 45 -; GCN-NEXT: v_readlane_b32 s18, v23, 46 -; GCN-NEXT: v_readlane_b32 s19, v23, 47 +; GCN-NEXT: v_readlane_b32 s4, v1, 32 +; 
GCN-NEXT: v_readlane_b32 s5, v1, 33 +; GCN-NEXT: v_readlane_b32 s6, v1, 34 +; GCN-NEXT: v_readlane_b32 s7, v1, 35 +; GCN-NEXT: v_readlane_b32 s8, v1, 36 +; GCN-NEXT: v_readlane_b32 s9, v1, 37 +; GCN-NEXT: v_readlane_b32 s10, v1, 38 +; GCN-NEXT: v_readlane_b32 s11, v1, 39 +; GCN-NEXT: v_readlane_b32 s12, v1, 40 +; GCN-NEXT: v_readlane_b32 s13, v1, 41 +; GCN-NEXT: v_readlane_b32 s14, v1, 42 +; GCN-NEXT: v_readlane_b32 s15, v1, 43 +; GCN-NEXT: v_readlane_b32 s16, v1, 44 +; GCN-NEXT: v_readlane_b32 s17, v1, 45 +; GCN-NEXT: v_readlane_b32 s18, v1, 46 +; GCN-NEXT: v_readlane_b32 s19, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s8, v23, 48 -; GCN-NEXT: v_readlane_b32 s9, v23, 49 -; GCN-NEXT: v_readlane_b32 s10, v23, 50 -; GCN-NEXT: v_readlane_b32 s11, v23, 51 -; GCN-NEXT: v_readlane_b32 s12, v23, 52 -; GCN-NEXT: v_readlane_b32 s13, v23, 53 -; GCN-NEXT: v_readlane_b32 s14, v23, 54 -; GCN-NEXT: v_readlane_b32 s15, v23, 55 -; GCN-NEXT: v_readlane_b32 s16, v23, 56 -; GCN-NEXT: v_readlane_b32 s17, v23, 57 -; GCN-NEXT: v_readlane_b32 s18, v23, 58 -; GCN-NEXT: v_readlane_b32 s19, v23, 59 -; GCN-NEXT: v_readlane_b32 s20, v23, 60 -; GCN-NEXT: v_readlane_b32 s21, v23, 61 -; GCN-NEXT: v_readlane_b32 s22, v23, 62 -; GCN-NEXT: v_readlane_b32 s23, v23, 63 -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s8, v1, 48 +; GCN-NEXT: v_readlane_b32 s9, v1, 49 +; GCN-NEXT: v_readlane_b32 s10, v1, 50 +; GCN-NEXT: v_readlane_b32 s11, v1, 51 +; GCN-NEXT: v_readlane_b32 s12, v1, 52 +; GCN-NEXT: v_readlane_b32 s13, v1, 53 +; GCN-NEXT: v_readlane_b32 s14, v1, 54 +; GCN-NEXT: v_readlane_b32 s15, v1, 55 +; GCN-NEXT: v_readlane_b32 s16, v1, 56 +; GCN-NEXT: v_readlane_b32 s17, v1, 57 +; GCN-NEXT: v_readlane_b32 s18, v1, 58 +; 
GCN-NEXT: v_readlane_b32 s19, v1, 59 +; GCN-NEXT: v_readlane_b32 s20, v1, 60 +; GCN-NEXT: v_readlane_b32 s21, v1, 61 +; GCN-NEXT: v_readlane_b32 s22, v1, 62 +; GCN-NEXT: v_readlane_b32 s23, v1, 63 ; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[8:23] ; GCN-NEXT: ;;#ASMEND @@ -203,6 +211,14 @@ ; GCN-NEXT: ; use s[4:5] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -20,10 +20,11 @@ liveins: $sgpr4 ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 - ; CHECK: liveins: $sgpr4, $vgpr0 + ; CHECK: liveins: $sgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], 
implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... @@ -45,10 +46,11 @@ liveins: $sgpr5 ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 - ; CHECK: liveins: $sgpr5, $vgpr0 + ; CHECK: liveins: $sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... 
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -13,32 +13,40 @@ ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 ; GCN-NEXT: s_mov_b32 s39, 0xe00000 -; GCN-NEXT: v_writelane_b32 v40, s4, 0 +; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_add_u32 s36, s36, s11 -; GCN-NEXT: v_writelane_b32 v40, s5, 1 +; GCN-NEXT: v_writelane_b32 v3, s4, 0 +; GCN-NEXT: s_movk_i32 s32, 0x400 ; GCN-NEXT: s_addc_u32 s37, s37, 0 -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: v_readlane_b32 s0, v40, 0 +; GCN-NEXT: s_mov_b32 s14, s10 ; GCN-NEXT: s_mov_b32 s13, s9 ; GCN-NEXT: s_mov_b32 s12, s8 -; GCN-NEXT: v_readlane_b32 s1, v40, 1 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] +; GCN-NEXT: v_writelane_b32 v3, s5, 1 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_readlane_b32 s0, v3, 0 +; GCN-NEXT: v_readlane_b32 s1, v3, 1 ; GCN-NEXT: s_add_u32 s8, s0, 36 ; GCN-NEXT: s_addc_u32 s9, s1, 0 ; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 -; GCN-NEXT: s_mov_b32 s14, s10 -; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[36:39], 
0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void @foo() ret void diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s # Check that we allocate 2 emergency stack slots if we're spilling # SGPRs to memory and potentially have an offset larger than fits in @@ -29,7 +29,7 @@ ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr1 + ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr1 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,10 +16,10 @@ ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -135,13 +135,13 @@ ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v255, s30, 0 ; GCN-NEXT: v_writelane_b32 v255, s31, 1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function@gotpcrel32@hi+12 @@ -266,10 +266,10 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: 
s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -310,10 +310,10 @@ ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -428,13 +428,13 @@ ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v254, s30, 0 ; GCN-NEXT: v_writelane_b32 v254, s31, 1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function@gotpcrel32@hi+12 @@ -558,10 +558,10 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload 
+; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -602,8 +602,8 @@ ; GCN-LABEL: spill_sgpr_with_sgpr_uses: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill @@ -715,22 +715,38 @@ ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v254, s4, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: 
v_readlane_b32 s4, v254, 0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s4 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -841,8 +857,8 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1167,7 +1183,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1299,17 +1315,17 @@ ; GCN-NEXT: ; implicit-def: 
$sgpr4_sgpr5 ; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -1431,7 +1447,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1512,7 +1528,7 @@ ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -1782,7 +1798,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -6,17 +6,17 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. 
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 +; SGPR: v_mov_b32_e32 v0, vcc_lo +; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload +; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0 -; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: s_mov_b64 exec, s[4:5] -; SGPR-NEXT: s_nop 1 -; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SGPR-NEXT: ; kill: killed $vgpr1 +; SGPR-NEXT: s_nop 4 ; ALL: s_endpgm define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -202,10 +202,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN-DAG: s_addk_i32 s32, 0x800 -; GCN: v_writelane_b32 [[CSRV_1]], [[FP_SCRATCH_COPY]], 0 +; GCN-DAG: s_addk_i32 s32, 0x400 +; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -227,12 +226,11 @@ ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 ; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 ; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV_1]], 0 +; GCN-NEXT: 
v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_addk_i32 s32, 0xf800 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -33,6 +33,7 @@ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF @@ -41,7 +42,15 @@ ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) ; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) - ; CHECK-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.0, 
align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr1, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr5, 1, killed $vgpr1 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr6, 2, killed $vgpr1 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr7, 3, killed $vgpr1 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr8, 4, killed $vgpr1 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr9, 5, killed $vgpr1 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr10, 6, killed $vgpr1 + ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 killed $sgpr11, 7, killed $vgpr1, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51 @@ -54,30 +63,50 @@ ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1 + ; 
CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2 + ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3 + ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4 + ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5 + ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6 + ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7 + ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1 + ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2 + ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3 + ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4 + ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5 + ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6 + ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7 ; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc + ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) @@ -87,6 +116,7 @@ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37 ; 
CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; CHECK-NEXT: KILL killed renamable $vgpr1 ; CHECK-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,20 +2,22 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_or_saveexec_b64 +; GCN: s_xor_saveexec_b64 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 +; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 -; GCN: s_or_saveexec_b64 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 +; GCN: s_xor_saveexec_b64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir --- 
a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted # by the copy propagation after lowering the spill. @@ -25,12 +25,14 @@ ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 + ; GCN-NEXT: KILL killed renamable $vgpr0 ; 
GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -62,11 +64,13 @@ ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: KILL killed renamable $vgpr0 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -93,12 +97,12 @@ ; GCN-LABEL: name: spill_vgpr128_use_subreg ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; 
GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) - ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1 + ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -123,11 +127,11 @@ ; GCN-LABEL: name: spill_vgpr128_use_kill ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 
5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -0,0 +1,319 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction. 
+--- +name: sgpr32_spill +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr10 + ; GCN-LABEL: name: sgpr32_spill + ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 + S_NOP 0 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31 +... + +# Needed an additional virtual lane register as the lanes of current register are fully occupied while spilling a wide SGPR tuple. +# There must be two implicit def for the two lane VGPRs. 
+ +--- +name: sgpr_spill_lane_crossover +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } + - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-LABEL: name: sgpr_spill_lane_crossover + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr0, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr64, 0, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr65, 1, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr66, 2, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr67, 3, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr68, 4, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr69, 5, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr70, 6, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr71, 7, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr72, 8, $vgpr0 + ; 
GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr73, 9, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr74, 10, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr75, 11, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr76, 12, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr77, 13, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr78, 14, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr79, 15, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr80, 16, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr81, 17, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr82, 18, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr83, 19, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr84, 20, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr85, 21, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr86, 22, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr87, 23, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr88, 24, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr89, 25, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr90, 26, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr91, 27, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr92, 28, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr93, 29, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr94, 30, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr95, 31, $vgpr0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 1, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 2, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 3, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 4, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 5, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 6, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 7, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 8, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 9, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 10, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 11, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 12, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 13, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 14, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 15, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 16, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 17, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 18, 
[[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 19, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 20, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 21, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 22, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 23, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 24, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 25, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 26, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 27, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 28, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 29, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 30, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 31, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 32, [[V_WRITELANE_B32_1]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1, implicit-def 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 + ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 + ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 + ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 + ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11 + ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 12 + ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 13 + ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 14 + ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 15 + ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 16 + ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 17 + ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 18 + ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 19 + ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 20 + ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 21 + ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 22 + ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 23 + ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 24 + ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 25 + ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 26 + ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 27 + ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 28 + ; GCN-NEXT: $sgpr92 = 
V_READLANE_B32 [[V_WRITELANE_B32_1]], 29 + ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 30 + ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 31 + ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32 + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 + S_NOP 0 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31 +... + +# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here). 
+ +--- +name: lane_vgpr_implicit_def_at_common_dominator_block +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 + bb.0: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + S_NOP 0 + S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + bb.1: + liveins: $sgpr10, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 10 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_BRANCH %bb.3 + bb.2: + liveins: $sgpr10, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 20 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 +... + +# The common dominator block is visited only at the end. The insertion point was initially identified to the +# terminator instruction in the dominator block which later becomes the point where a spill get inserted in the same block. + +--- +name: dominator_block_follows_the_successors_bbs +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[DEF]], 0 + ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[DEF]], 0 + ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc + ; GCN-NEXT: S_BRANCH 
%bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 + bb.0: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + S_NOP 0 + S_BRANCH %bb.3 + bb.1: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc + S_BRANCH %bb.2 + bb.2: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 10 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + S_BRANCH %bb.1 + bb.4: + liveins: $sgpr10, $sgpr30_sgpr31 + S_NOP 0 + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s + +; Regression test for `processFunctionBeforeFrameFinalized`: +; Check that it correctly updates RegisterScavenger so we +; don't end up with bad machine code due to using undefined +; physical registers. + +define void @test() { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %bb.0 +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: .LBB0_1: ; %bb.1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 +; CHECK-NEXT: ; %bb.2: ; %bb.2 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: .LBB0_3: ; %bb.3 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: ; implicit-def: $sgpr4 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v1 +; CHECK-NEXT: s_mov_b64 s[4:5], -1 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: s_cmp_eq_u32 s6, s7 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: s_mov_b64 s[10:11], exec +; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 +; CHECK-NEXT: ; %bb.4: ; %bb.4 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; 
CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: .LBB0_5: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: v_readlane_b32 s4, v0, 0 +; CHECK-NEXT: v_readlane_b32 s5, v0, 1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: ; implicit-def: $sgpr5 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 +; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.6: ; %bb.5 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb.0: + br label %bb.1 +bb.1: ; preds = %bb.4, %bb.0 + br i1 poison, label %bb.2, label %bb.3 +bb.2: ; preds = %bb.1 + br label %bb.3 +bb.3: ; preds = %bb.2, %bb.1 + %call = tail call i32 @llvm.amdgcn.readfirstlane(i32 poison) + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %bb.5, label %bb.4 +bb.4: ; preds = %bb.3 + br label %bb.1 +bb.5: ; preds = %bb.3 + ret void +} + +declare i32 @llvm.amdgcn.readfirstlane(i32) diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll --- 
a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -10,16 +10,16 @@ ; GCN-LABEL: sgpr_spill_writelane: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 v0, s35, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_readlane_b32 s35, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35}"() diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -32,32 +32,29 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr192 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: 
[[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 ; EXPANDED-NEXT: S_NOP 0, implicit 
killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 bb.0: S_NOP 0, implicit-def %0:sgpr_192 diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -30,34 +30,31 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr224 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] + ; 
EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 bb.0: S_NOP 0, implicit-def %0:sgpr_224 diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir --- a/llvm/test/CodeGen/AMDGPU/spill288.mir +++ b/llvm/test/CodeGen/AMDGPU/spill288.mir @@ -30,38 +30,35 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr288 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} + ; 
EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 8, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: 
[[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 bb.0: S_NOP 0, implicit-def %0:sgpr_288 diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir 
--- a/llvm/test/CodeGen/AMDGPU/spill320.mir +++ b/llvm/test/CodeGen/AMDGPU/spill320.mir @@ -30,40 +30,37 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr320 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr13, 9, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, 
[[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = 
V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 bb.0: S_NOP 0, implicit-def %0:sgpr_320 diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir --- a/llvm/test/CodeGen/AMDGPU/spill352.mir +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir @@ -30,42 +30,39 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr352 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr14, 10, $vgpr0, implicit killed 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 bb.0: S_NOP 0, implicit-def %0:sgpr_352 diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir --- a/llvm/test/CodeGen/AMDGPU/spill384.mir +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir @@ -30,44 +30,41 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr384 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: 
[[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 10, $vgpr0 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr15, 11, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 
$sgpr9, 5, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr14, 10, [[V_WRITELANE_B32_1]] + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr15, 11, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 - ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 11 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 + ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 bb.0: S_NOP 0, implicit-def %0:sgpr_384 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -161,26 +161,24 @@ ; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 +; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 ; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-DAG: s_add_i32 s32, s32, 0x30000 -; GCN: v_writelane_b32 [[VGPR_REG_1]], [[FP_SCRATCH_COPY]], 0 +; GCN: 
v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN: s_swappc_b64 s[30:31], ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG_1]], 0 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -16,94 +16,96 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 +; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; kill: killed $sgpr0_sgpr1 ; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v0, s2, 0 +; CHECK-NEXT: v_writelane_b32 v2, s2, 0 ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s1, 0x40180000 -; CHECK-NEXT: v_writelane_b32 v0, s0, 1 -; CHECK-NEXT: v_writelane_b32 v0, s1, 2 +; CHECK-NEXT: v_writelane_b32 v2, s0, 1 +; CHECK-NEXT: v_writelane_b32 v2, s1, 2 ; CHECK-NEXT: s_mov_b32 s1, 0x40220000 -; CHECK-NEXT: v_writelane_b32 v0, s0, 3 -; CHECK-NEXT: v_writelane_b32 v0, s1, 4 +; CHECK-NEXT: v_writelane_b32 v2, s0, 3 +; CHECK-NEXT: v_writelane_b32 v2, s1, 4 ; CHECK-NEXT: s_mov_b32 s1, 0x40240000 -; CHECK-NEXT: 
v_writelane_b32 v0, s0, 5 -; CHECK-NEXT: v_writelane_b32 v0, s1, 6 +; CHECK-NEXT: v_writelane_b32 v2, s0, 5 +; CHECK-NEXT: v_writelane_b32 v2, s1, 6 ; CHECK-NEXT: s_mov_b32 s1, 0x40260000 -; CHECK-NEXT: v_writelane_b32 v0, s0, 7 +; CHECK-NEXT: v_writelane_b32 v2, s0, 7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: v_writelane_b32 v0, s1, 8 -; CHECK-NEXT: v_mov_b32_e32 v2, s3 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_writelane_b32 v2, s1, 8 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], 0 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 ; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40140000 -; CHECK-NEXT: v_writelane_b32 v0, s6, 9 -; CHECK-NEXT: v_writelane_b32 v0, s7, 10 -; CHECK-NEXT: v_writelane_b32 v0, s0, 11 -; CHECK-NEXT: v_readlane_b32 s6, v0, 1 -; CHECK-NEXT: v_readlane_b32 s7, v0, 2 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: v_writelane_b32 v2, s6, 9 +; CHECK-NEXT: v_writelane_b32 v2, s7, 10 +; CHECK-NEXT: v_writelane_b32 v2, s0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v2, 1 +; CHECK-NEXT: v_readlane_b32 s7, v2, 2 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: s_mov_b32 s1, s7 ; CHECK-NEXT: s_mov_b32 s0, s2 -; CHECK-NEXT: v_writelane_b32 v0, s6, 1 -; CHECK-NEXT: v_writelane_b32 v0, s7, 2 -; CHECK-NEXT: v_readlane_b32 s6, v0, 9 -; CHECK-NEXT: v_readlane_b32 s7, v0, 10 +; CHECK-NEXT: v_writelane_b32 v2, s6, 1 +; CHECK-NEXT: v_writelane_b32 v2, s7, 2 +; CHECK-NEXT: v_readlane_b32 s6, v2, 9 +; CHECK-NEXT: v_readlane_b32 s7, v2, 10 ; CHECK-NEXT: s_mov_b32 s6, s2 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[0:1] -; CHECK-NEXT: v_readlane_b32 s0, v0, 3 -; CHECK-NEXT: v_readlane_b32 s1, v0, 4 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1] +; CHECK-NEXT: v_readlane_b32 s0, v2, 3 +; CHECK-NEXT: v_readlane_b32 s1, v2, 4 ; CHECK-NEXT: s_mov_b32 s3, s1 ; 
CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[6:7] -; CHECK-NEXT: v_writelane_b32 v0, s0, 3 -; CHECK-NEXT: v_writelane_b32 v0, s1, 4 -; CHECK-NEXT: v_readlane_b32 s0, v0, 5 -; CHECK-NEXT: v_readlane_b32 s1, v0, 6 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] +; CHECK-NEXT: v_writelane_b32 v2, s0, 3 +; CHECK-NEXT: v_writelane_b32 v2, s1, 4 +; CHECK-NEXT: v_readlane_b32 s0, v2, 5 +; CHECK-NEXT: v_readlane_b32 s1, v2, 6 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] ; CHECK-NEXT: s_mov_b32 s3, s1 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_writelane_b32 v0, s0, 5 -; CHECK-NEXT: v_writelane_b32 v0, s1, 6 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] -; CHECK-NEXT: v_readlane_b32 s0, v0, 7 -; CHECK-NEXT: v_readlane_b32 s1, v0, 8 +; CHECK-NEXT: v_writelane_b32 v2, s0, 5 +; CHECK-NEXT: v_writelane_b32 v2, s1, 6 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; CHECK-NEXT: v_readlane_b32 s0, v2, 7 +; CHECK-NEXT: v_readlane_b32 s1, v2, 8 ; CHECK-NEXT: s_mov_b32 s3, s1 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] -; CHECK-NEXT: v_writelane_b32 v0, s0, 7 -; CHECK-NEXT: v_writelane_b32 v0, s1, 8 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; CHECK-NEXT: v_writelane_b32 v2, s0, 7 +; CHECK-NEXT: v_writelane_b32 v2, s1, 8 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s4, s0 -; CHECK-NEXT: v_readlane_b32 s0, v0, 0 -; CHECK-NEXT: v_readlane_b32 s2, v0, 11 -; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[4:5] +; CHECK-NEXT: v_readlane_b32 s0, v2, 0 +; CHECK-NEXT: v_readlane_b32 s2, v2, 11 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 
s[4:5] ; CHECK-NEXT: s_add_i32 s2, s2, s0 -; CHECK-NEXT: v_writelane_b32 v0, s2, 11 -; CHECK-NEXT: v_readlane_b32 s0, v0, 11 +; CHECK-NEXT: v_writelane_b32 v2, s2, 11 +; CHECK-NEXT: v_readlane_b32 s0, v2, 11 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: global_store_dwordx2 v[3:4], v[1:2], off +; CHECK-NEXT: global_store_dwordx2 v[3:4], v[0:1], off +; CHECK-NEXT: ; kill: killed $vgpr2 ; CHECK-NEXT: s_endpgm entry: %0 = load i32, ptr addrspace(4) null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -12,10 +12,9 @@ ; GFX90A-NEXT: s_mov_b32 s33, s32 ; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: s_addk_i32 s32, 0x400 -; GFX90A-NEXT: v_writelane_b32 v41, s16, 0 +; GFX90A-NEXT: v_writelane_b32 v40, s16, 2 ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, wobble@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12 @@ -35,12 +34,12 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, v0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; 
GLOBALNESS1-NEXT: global_load_dword v0, v40, s[76:77] +; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[76:77] ; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 @@ -48,10 +47,10 @@ ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -117,9 +116,9 @@ ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1] -; GLOBALNESS1-NEXT: flat_load_dword v43, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -128,7 +127,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] @@ -171,13 +170,13 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 
Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v43, vcc +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 @@ -224,7 +223,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -234,15 +233,15 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[40:41], off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -269,15 +268,15 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccnz 
.LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -292,7 +291,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -310,7 +309,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -322,12 +321,12 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, v0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 
v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[72:73] +; GLOBALNESS0-NEXT: global_load_dword v0, v42, s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 @@ -335,10 +334,10 @@ ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -404,9 +403,9 @@ ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1] -; GLOBALNESS0-NEXT: flat_load_dword v43, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -415,7 +414,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], 
s[78:79] ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] @@ -458,13 +457,13 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v43, vcc +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 @@ -511,7 +510,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -521,15 +520,15 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[40:41], off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: global_store_dwordx2 
v[46:47], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -556,15 +555,15 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -579,7 +578,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -597,7 +596,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll 
b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -10,9 +10,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v42, s16, 0 +; GCN-NEXT: v_writelane_b32 v40, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -120,10 +119,9 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v42, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -269,9 +267,8 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v46, s16, 0 +; GCN-NEXT: v_writelane_b32 v40, s16, 28 ; GCN-NEXT: s_addk_i32 s32, 0x800 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill @@ -474,10 +471,9 @@ ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: 
buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v46, 0 +; GCN-NEXT: v_readlane_b32 s4, v40, 28 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -13,47 +13,61 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: v_mov_b32_e32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: global_load_ushort v3, v0, s[4:5] offset:4 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: ds_write_b8 v0, v2 +; CHECK-NEXT: ds_write_b8 v1, v2 ; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_writelane_b32 v1, s4, 0 -; CHECK-NEXT: v_writelane_b32 v1, s5, 1 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, 
s5, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb193 ; CHECK-NEXT: .LBB0_2: ; %bb194 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4 ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb201 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, V2@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_short v0, v2, s[4:5] +; CHECK-NEXT: global_store_short v0, v1, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_barrier ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock +; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: %i10 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git 
a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -15,7 +15,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 @@ -36,7 +35,7 @@ ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v45, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -55,10 +54,9 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v45, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -72,7 +70,6 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 @@ -94,12 +91,12 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: 
image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v45, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 2 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -114,11 +111,9 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v45, 0 +; GFX10-NEXT: v_readlane_b32 s4, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -132,9 +127,7 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 @@ -154,12 +147,12 @@ ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v45, s0, 0 +; 
GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -172,11 +165,9 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v45, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -214,7 +205,6 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill @@ -228,7 +218,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v41, v12 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v46, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -248,10 +238,9 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], 
s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v46, 0 +; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -265,7 +254,6 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill @@ -275,18 +263,18 @@ ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v46, s4, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 2 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v41, v16 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v42, v15 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_mov_b32_e32 v45, v12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ 
-300,11 +288,9 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v46, 0 +; GFX10-NEXT: v_readlane_b32 s4, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 -; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -318,9 +304,7 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 @@ -330,16 +314,16 @@ ; GFX11-NEXT: scratch_store_b32 off, v45, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v46, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v45, v12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -353,11 +337,9 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v46, 0 +; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 -; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -9,17 +9,21 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id !0 { ; CHECK-LABEL: kern: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_mov_b32 s32, 0x200 ; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_writelane_b32 v40, s16, 0 +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_or_saveexec_b32 s33, -1 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s33 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v40, 0 +; CHECK-NEXT: v_readlane_b32 s14, v3, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 @@ -52,6 +56,10 @@ ; 
CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_or_saveexec_b32 s33, -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s33 +; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm store i32 42, ptr %addr call fastcc void @unknown_call() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2858,26 +2858,23 @@ ; GFX1032-NEXT: s_mov_b32 s33, s32 ; GFX1032-NEXT: s_or_saveexec_b32 s17, -1 ; GFX1032-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX1032-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_mov_b32 exec_lo, s17 ; GFX1032-NEXT: s_addk_i32 s32, 0x200 -; GFX1032-NEXT: v_writelane_b32 v41, s16, 0 +; GFX1032-NEXT: v_writelane_b32 v40, s16, 2 ; GFX1032-NEXT: s_getpc_b64 s[16:17] ; GFX1032-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12 -; GFX1032-NEXT: v_writelane_b32 v40, s30, 0 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX1032-NEXT: v_writelane_b32 v40, s30, 0 ; GFX1032-NEXT: v_writelane_b32 v40, s31, 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 -; GFX1032-NEXT: v_readlane_b32 s4, v41, 0 +; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 -; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX1032-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX1032-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 @@ -2892,26 +2889,23 @@ ; GFX1064-NEXT: s_mov_b32 s33, s32 ; GFX1064-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX1064-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX1064-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[18:19] ; GFX1064-NEXT: s_addk_i32 s32, 0x400 -; GFX1064-NEXT: v_writelane_b32 v41, s16, 0 +; GFX1064-NEXT: v_writelane_b32 v40, s16, 2 ; GFX1064-NEXT: s_getpc_b64 s[16:17] ; GFX1064-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12 -; GFX1064-NEXT: v_writelane_b32 v40, s30, 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX1064-NEXT: v_writelane_b32 v40, s30, 0 ; GFX1064-NEXT: v_writelane_b32 v40, s31, 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 -; GFX1064-NEXT: v_readlane_b32 s4, v41, 0 +; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 -; GFX1064-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX90A %s + +; The test forces a high vector register pressure and there won't be sufficient VGPRs to be allocated +; for writelane/readlane SGPR spill instructions. Regalloc would split the vector register liverange +; by introducing a copy to AGPR register. The VGPR store to AGPR (v_accvgpr_write_b32) and later the +; restore from AGPR (v_accvgpr_read_b32) should be whole-wave operations and hence exec mask should be +; manipulated to ensure all lanes are active when these instructions are executed. +define void @vector_reg_liverange_split() #0 { +; GFX90A-LABEL: vector_reg_liverange_split: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b32 s16, s33 +; GFX90A-NEXT: s_mov_b32 s33, s32 +; GFX90A-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, -1 +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, s[18:19] +; GFX90A-NEXT: v_writelane_b32 v40, s16, 2 +; GFX90A-NEXT: ; implicit-def: $vgpr0 +; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 +; GFX90A-NEXT: s_addk_i32 s32, 0x400 +; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s20 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_writelane_b32 v0, s20, 0 +; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX90A-NEXT: s_mov_b64 exec, s[28:29] +; GFX90A-NEXT: s_getpc_b64 s[16:17] +; GFX90A-NEXT: s_add_u32 s16, s16, foo@gotpcrel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s17, s17, foo@gotpcrel32@hi+12 +; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a32 +; GFX90A-NEXT: 
s_mov_b64 exec, s[28:29] +; GFX90A-NEXT: v_readlane_b32 s20, v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s20 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 +; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-NEXT: ; kill: killed $vgpr0 +; GFX90A-NEXT: v_readlane_b32 s4, v40, 2 +; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, -1 +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: s_mov_b64 exec, s[6:7] +; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 +; GFX90A-NEXT: s_mov_b32 s33, s4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] + %s20 = call i32 asm sideeffect "; def $0","=${s20}"() + call void @foo() + call void asm sideeffect "; use $0","${s20}"(i32 %s20) + ret void +} + +declare void @foo() + +attributes #0 = { "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"} diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s + +; Test whole-wave register spilling. + +; In this testcase, the return address registers, PC value (SGPR30_SGPR31) and the scratch SGPR used in +; the inline asm statements should be preserved across the call. 
Since the test limits the VGPR numbers, +; the PC will be spilled to the only available CSR VGPR (VGPR40) as we spill CSR SGPRs including the PC +; directly to the physical VGPR lane to correctly generate the CFIs. The SGPR20 will get spilled to the +; virtual VGPR lane and that would be allocated by regalloc. Since there is no free VGPR to allocate, RA +; must spill a scratch VGPR. The writelane/readlane instructions that spill/restore SGPRs into/from VGPR +; are whole-wave operations and hence the VGPRs involved in such operations require whole-wave spilling. + +define void @test() #0 { +; GCN-LABEL: test: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s28, 2 +; GCN-NEXT: v_writelane_b32 v40, s29, 3 +; GCN-NEXT: v_writelane_b32 v40, s16, 4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: s_addk_i32 s32, 0x800 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; def s16 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: v_writelane_b32 v0, s16, 0 +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; 
GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: v_readlane_b32 s28, v40, 2 +; GCN-NEXT: v_readlane_b32 s29, v40, 3 +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0xf800 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: test: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s16, s33 +; GCN-O0-NEXT: s_mov_b32 s33, s32 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, -1 +; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] +; GCN-O0-NEXT: v_writelane_b32 v40, s28, 2 +; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3 +; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4 +; GCN-O0-NEXT: s_add_i32 s32, s32, 0x400 +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-O0-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-O0-NEXT: ;;#ASMSTART +; GCN-O0-NEXT: ; def s16 +; GCN-O0-NEXT: ;;#ASMEND +; GCN-O0-NEXT: v_writelane_b32 v0, s16, 0 +; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-O0-NEXT: buffer_store_dword v0, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[28:29] +; GCN-O0-NEXT: s_getpc_b64 s[16:17] +; GCN-O0-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 +; GCN-O0-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 +; GCN-O0-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-O0-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] +; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[28:29] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 +; GCN-O0-NEXT: global_store_dword v[1:2], v3, off +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 +; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 +; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, -1 +; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 +; GCN-O0-NEXT: s_mov_b32 s33, s4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 + call void @ext_func() + store volatile i32 %sgpr, ptr addrspace(1) undef + ret void +} + +declare void @ext_func(); + +attributes #0 = { nounwind "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"} 
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -141,10 +141,15 @@ ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: ; implicit-def: $vgpr3 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 @@ -159,17 +164,18 @@ ; GFX9-O0-NEXT: s_mov_b32 s37, s44 ; GFX9-O0-NEXT: s_mov_b32 s38, s43 ; GFX9-O0-NEXT: s_mov_b32 s39, s42 -; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 2 -; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 3 +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -182,21 +188,24 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -209,25 +218,28 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 4 -; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(2) +; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 4 +; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] -; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 2 -; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 0 +; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 1 +; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 2 +; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 3 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[36:37] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, v4 +; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[36:37] ; GFX9-O0-NEXT: s_mov_b32 s36, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s36, v3 ; GFX9-O0-NEXT: s_mov_b32 s36, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s36 +; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s36 ; GFX9-O0-NEXT: s_mov_b32 s40, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, s39 @@ -237,11 +249,12 @@ ; GFX9-O0-NEXT: s_mov_b32 s38, s35 ; GFX9-O0-NEXT: s_mov_b32 s39, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -546,25 +559,28 @@ ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s46, s33 +; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 
offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 ; 
GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 @@ -582,10 +598,10 @@ ; GFX9-O0-NEXT: s_mov_b32 s41, s45 ; GFX9-O0-NEXT: s_mov_b32 s42, s44 ; GFX9-O0-NEXT: s_mov_b32 s43, s35 -; GFX9-O0-NEXT: v_writelane_b32 v10, s40, 2 -; GFX9-O0-NEXT: v_writelane_b32 v10, s41, 3 -; GFX9-O0-NEXT: v_writelane_b32 v10, s42, 4 -; GFX9-O0-NEXT: v_writelane_b32 v10, s43, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 @@ -597,8 +613,11 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 -; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -615,13 +634,20 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 -; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 -; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -630,27 +656,30 @@ ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: s_mov_b32 s33, s46 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 +; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -909,7 +938,7 @@ ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: 
s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -922,35 +951,35 @@ ; GFX9-O0-NEXT: v_writelane_b32 v42, s65, 1 ; GFX9-O0-NEXT: v_writelane_b32 v42, s66, 2 ; GFX9-O0-NEXT: v_writelane_b32 v42, s67, 3 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword 
v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -960,145 +989,145 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 -; 
GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 -; GFX9-O0-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25 ; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26 ; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27 ; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; 
GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 ; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 ; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; 
GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1227,7 +1256,7 @@ ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -125,7 +125,8 @@ ; GFX9-O0: v_add_u32_e64 v3, v3, v6 %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) -; GFX9: buffer_store_dword v0 +; GFX9-O0: buffer_store_dword v1 +; GFX9-O3: buffer_store_dword v0 call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, 
i32 4, i32 0, i32 0) ret void } @@ -320,7 +321,8 @@ ; GFX9-O0: v_add_u32_e64 v3, v3, v6 %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) -; GFX9: buffer_store_dword v0 +; GFX9-O0: buffer_store_dword v1 +; GFX9-O3: buffer_store_dword v0 call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -406,4 +408,4 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) \ No newline at end of file diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir @@ -3,7 +3,7 @@ # contains not dead objects only. So using objects IDs as offset in the storage # caused out of bounds access. -# RUN: llc -march=amdgcn -run-pass=si-lower-sgpr-spills,prologepilog -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s # CHECK-LABEL: name: foo # CHECK: {{^}}fixedStack: []