diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1520,6 +1520,9 @@ if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) return true; + if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy)) + return true; + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { // Create a diagnostic for a the register string literal. const MemoryBuffer &Buffer = diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -905,6 +905,13 @@ }; } // namespace ImplicitArg + +namespace VirtRegFlag { +// Virtual Register Flags. +enum Register_Flag : uint8_t { WWM_REG = 0 }; + +} // namespace VirtRegFlag + } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -34,8 +34,8 @@ RegScavenger *RS = nullptr) const override; void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const; - void determinePrologEpilogSGPRSaves(MachineFunction &MF, - BitVector &SavedRegs) const; + void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, + bool NeedExecCopyReservedReg) const; void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, Register FrameReg, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -66,7 +66,8 @@ static void getVGPRSpillLaneOrTempRegister( 
MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, - const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) { + const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, + bool IncludeScratchCopy = true) { SIMachineFunctionInfo *MFI = MF.getInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); @@ -77,9 +78,12 @@ // We need to save and restore the given SGPR. + Register ScratchSGPR; // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs - // should have all the callee saved registers marked as used. - Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); + // should have all the callee saved registers marked as used. For certain + // cases we skip copy to scratch SGPR. + if (IncludeScratchCopy) + ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); if (!ScratchSGPR) { int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, @@ -1348,8 +1352,8 @@ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); + RS->enterBasicBlockEnd(MBB); + RS->backward(MI); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; @@ -1446,8 +1450,10 @@ // The special SGPR spills like the one needed for FP, BP or any reserved // registers delayed until frame lowering. 
void SIFrameLowering::determinePrologEpilogSGPRSaves( - MachineFunction &MF, BitVector &SavedVGPRs) const { + MachineFunction &MF, BitVector &SavedVGPRs, + bool NeedExecCopyReservedReg) const { MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1459,6 +1465,27 @@ for (unsigned I = 0; CSRegs[I]; ++I) LiveRegs.addReg(CSRegs[I]); + if (NeedExecCopyReservedReg) { + Register ReservedReg = MFI->getSGPRForEXECCopy(); + assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); + const TargetRegisterClass &RC = ST.isWave32() + ? AMDGPU::SReg_32_XM0_XEXECRegClass + : AMDGPU::SGPR_64RegClass; + Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); + if (UnusedScratchReg) { + // If found any unused scratch SGPR, reserve the register itself for Exec + // copy and there is no need for any spills in that case. + MFI->setSGPRForEXECCopy(UnusedScratchReg); + LiveRegs.addReg(UnusedScratchReg); + } else { + // Needs spill. + assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && + "Re-reserving spill slot for EXEC copy register"); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, + /* IncludeScratchCopy */ false); + } + } + // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. 
@@ -1497,6 +1524,8 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool NeedExecCopyReservedReg = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -1514,6 +1543,8 @@ MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); + else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) + NeedExecCopyReservedReg = true; } } @@ -1526,7 +1557,7 @@ if (!ST.hasGFX90AInsts()) SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); - determinePrologEpilogSGPRSaves(MF, SavedVGPRs); + determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't // allow the default insertion to handle them. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12510,6 +12510,14 @@ } } + // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. + unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); + Register SReg = + ST.isWave32() + ? 
AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) + : AMDGPU::SGPR_64RegClass.getRegister((MaxNumSGPRs / 2) - 1); + Info->setSGPRForEXECCopy(SReg); + TargetLoweringBase::finalizeLowering(MF); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -625,6 +625,11 @@ return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; } + static bool isWWMRegSpillOpcode(uint16_t Opcode) { + return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || + Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE; + } + static bool isDPP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DPP; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1511,6 +1511,29 @@ } } +static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { + // Currently, only 32-bit WWM register spills are needed. + if (Size != 4) + llvm_unreachable("unknown wwm register spill size"); + + return AMDGPU::SI_SPILL_WWM_V32_SAVE; +} + +static unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &MFI, + const MachineRegisterInfo &MRI) { + // Choose the right opcode if spilling a WWM register. + if (MFI.checkFlag(MRI, Reg, AMDGPU::VirtRegFlag::WWM_REG)) + return getWWMRegSpillSaveOpcode(Size); + + return TRI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(Size) + : TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) + : getVGPRSpillSaveOpcode(Size); +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, @@ -1557,9 +1580,8 @@ return; } - unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize) - : RI.isAGPRClass(RC) ? 
getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = + getVectorRegSpillSaveOpcode(SrcReg, RC, SpillSize, RI, *MFI, MRI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1678,6 +1700,29 @@ } } +static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { + // Currently, only 32-bit WWM register spills are needed. + if (Size != 4) + llvm_unreachable("unknown wwm register spill size"); + + return AMDGPU::SI_SPILL_WWM_V32_RESTORE; +} + +static unsigned getVectorRegSpillRestoreOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &MFI, + const MachineRegisterInfo &MRI) { + // Choose the right opcode if restoring a WWM register. + if (MFI.checkFlag(MRI, Reg, AMDGPU::VirtRegFlag::WWM_REG)) + return getWWMRegSpillRestoreOpcode(Size); + + return TRI.isVectorSuperClass(RC) ? getAVSpillRestoreOpcode(Size) + : TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) + : getVGPRSpillRestoreOpcode(Size); +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1686,6 +1731,7 @@ MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned SpillSize = TRI->getSpillSize(*RC); @@ -1706,7 +1752,6 @@ // lowered to non-memory instructions. const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); if (DestReg.isVirtual() && SpillSize == 4) { - MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } @@ -1720,10 +1765,9 @@ return; } - unsigned Opcode = RI.isVectorSuperClass(RC) - ? getAVSpillRestoreOpcode(SpillSize) - : RI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); + unsigned Opcode = + getVectorRegSpillRestoreOpcode(DestReg, RC, SpillSize, RI, *MFI, MRI); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -853,6 +853,8 @@ defm SI_SPILL_AV512 : SI_SPILL_VGPR ; defm SI_SPILL_AV1024 : SI_SPILL_VGPR ; +defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR ; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -20,6 +20,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/InitializePasses.h" @@ -38,6 +39,7 @@ const SIInstrInfo *TII = nullptr; LiveIntervals *LIS = nullptr; SlotIndexes *Indexes = nullptr; + MachineDominatorTree *MDT = nullptr; // Save and Restore blocks of the current function. Typically there is a // single save block, unless Windows EH funclets are involved. 
@@ -51,13 +53,23 @@ void calculateSaveRestoreBlocks(MachineFunction &MF); bool spillCalleeSavedRegs(MachineFunction &MF); + void updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr); bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getClearedProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::NoVRegs); + } }; } // end anonymous namespace @@ -68,6 +80,7 @@ "SI lower SGPR spill instructions", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, "SI lower SGPR spill instructions", false, false) @@ -247,6 +260,55 @@ return false; } +void SILowerSGPRSpills::updateLaneVGPRDomInstr( + int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, + DenseMap &LaneVGPRDomInstr) { + // For the Def of a virtual LaneVGPR to dominate all its uses, we should + // insert an IMPLICIT_DEF before the dominating spill. Switching to a + // depth first order doesn't really help since the machine function can be in + // the unstructured control flow post-SSA. For each virtual register, hence + // finding the common dominator to get either the dominating spill or a block + // dominating all spills. Is there a better way to handle it? 
+ SIMachineFunctionInfo *FuncInfo = + MBB->getParent()->getInfo(); + ArrayRef VGPRSpills = + FuncInfo->getSGPRSpillToVGPRLanes(FI); + Register PrevLaneVGPR; + for (auto &Spill : VGPRSpills) { + if (PrevLaneVGPR == Spill.VGPR) + continue; + + PrevLaneVGPR = Spill.VGPR; + auto I = LaneVGPRDomInstr.find(Spill.VGPR); + if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) { + // Initially add the spill instruction itself for Insertion point. + LaneVGPRDomInstr[Spill.VGPR] = InsertPt; + } else { + assert(I != LaneVGPRDomInstr.end()); + auto PrevInsertPt = I->second; + MachineBasicBlock *DomMBB = PrevInsertPt->getParent(); + if (DomMBB == MBB) { + // The insertion point earlier selected in a predecessor block whose + // spills are currently being lowered. The earlier InsertPt would be + // the one just before the block terminator and it should be changed + // if we insert any new spill in it. + if (MDT->dominates(&*InsertPt, &*PrevInsertPt)) + I->second = InsertPt; + + continue; + } + + // Find the common dominator block between PrevInsertPt and the + // current spill. + DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB); + if (DomMBB == MBB) + I->second = InsertPt; + else if (DomMBB != PrevInsertPt->getParent()) + I->second = &(*DomMBB->getFirstTerminator()); + } + } +} + bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -254,6 +316,7 @@ LIS = getAnalysisIfAvailable(); Indexes = getAnalysisIfAvailable(); + MDT = &getAnalysis(); assert(SaveBlocks.empty() && RestoreBlocks.empty()); @@ -263,7 +326,6 @@ bool HasCSRs = spillCalleeSavedRegs(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); if (!MFI.hasStackObjects() && !HasCSRs) { @@ -273,7 +335,6 @@ } bool MadeChange = false; - bool NewReservedRegs = false; // TODO: CSR VGPRs will never be spilled to AGPRs. 
These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. @@ -289,6 +350,9 @@ // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + // To track the IMPLICIT_DEF insertion point for the lane vgprs. + DenseMap LaneVGPRDomInstr; + for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { if (!TII->isSGPRSpill(MI)) @@ -296,23 +360,32 @@ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + MachineInstrSpan MIS(&MI, &MBB); if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { - NewReservedRegs = true; bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS); (void)Spilled; assert(Spilled && "failed to spill SGPR to VGPR when allocated"); SpillFIs.set(FI); + updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr); } } } - // FIXME: Adding to live-ins redundant with reserving registers. - for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { + auto InsertPt = LaneVGPRDomInstr[Reg]; + // Insert the IMPLICIT_DEF at the identified points. + auto MIB = + BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Reg); + FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MIB); + LIS->createAndComputeVirtRegInterval(Reg); + } + } + for (MachineBasicBlock &MBB : MF) { // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -333,15 +406,26 @@ // lane". 
FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass *RC = + ST.isWave32() ? &AMDGPU::SGPR_32RegClass : &AMDGPU::SGPR_64RegClass; + // Shift back the reserved SGPR for EXEC copy into the lowest range. + // This SGPR is reserved to handle the whole-wave spill/copy operations + // that might get inserted during vgpr regalloc. + Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF); + if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < + TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) + FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); + MadeChange = true; + } else { + // No SGPR spills and hence there won't be any WWM spills/copies. Reset the + // SGPR reserved for EXEC copy. + FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any VGPRs added for SGPR spills. - if (NewReservedRegs) - MRI.freezeReservedRegs(MF); - return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -300,6 +300,7 @@ SIMode Mode; Optional ScavengeFI; StringValue VGPRForAGPRCopy; + StringValue SGPRForEXECCopy; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, @@ -341,6 +342,8 @@ YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, StringValue()); // Don't print out when it's empty. + YamlIO.mapOptional("sgprForEXECCopy", MFI.SGPRForEXECCopy, + StringValue()); // Don't print out when it's empty. } }; @@ -377,7 +380,8 @@ /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. 
-class SIMachineFunctionInfo final : public AMDGPUMachineFunction { +class SIMachineFunctionInfo final : public AMDGPUMachineFunction, + private MachineRegisterInfo::Delegate { friend class GCNTargetMachine; // State of MODE register, assumed FP mode. @@ -477,6 +481,9 @@ unsigned HighBitsOf32BitAddress; + // Flags associated with the virtual registers. + IndexedMap VRegFlags; + // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -486,6 +493,10 @@ MCPhysReg getNextSystemSGPR() const; + // MachineRegisterInfo callback functions to notify events. + void MRI_NoteNewVirtualRegister(Register Reg) override; + void MRI_NotecloneVirtualRegister(Register NewReg, Register SrcReg) override; + public: struct VGPRSpillToAGPR { SmallVector Lanes; @@ -494,11 +505,11 @@ }; private: - // To track VGPR + lane index for each subregister of the SGPR spilled to - // frameindex key during SILowerSGPRSpills pass. + // To track virtual VGPR + lane index for each subregister of the SGPR spilled + // to frameindex key during SILowerSGPRSpills pass. DenseMap> SGPRSpillToVGPRLanes; - // To track VGPR + lane index for spilling special SGPRs like Frame Pointer - // identified during PrologEpilogInserter. + // To track physical VGPR + lane index for spilling special SGPRs like Frame + // Pointer identified during PrologEpilogInserter. DenseMap> PrologEpilogSGPRSpillToVGPRLanes; unsigned NumVGPRSpillLanes = 0; @@ -528,6 +539,9 @@ // PrologEpilogInserter. PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; + // To save/restore EXEC MASK around WWM spills and copies. + Register SGPRForEXECCopy; + DenseMap VGPRToAGPRSpills; // AGPRs used for VGPR spills. 
@@ -651,6 +665,27 @@ : makeArrayRef(I->second); } + void setFlag(Register Reg, uint8_t Flag) { + assert(Reg.isVirtual()); + if (VRegFlags.inBounds(Reg)) + VRegFlags[Reg] |= (uint8_t)1 << Flag; + } + + bool checkFlag(const MachineRegisterInfo &MRI, Register Reg, + uint8_t Flag) const { + if (!Reg.isVirtual()) { + // See if a virtReg is available for the physReg. If found, check the + // flags of the virtual register. + Register VirtReg = MRI.getPhysToCurrentVirtReg(); + if (!VirtReg) + return false; + + Reg = VirtReg; + } + + return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & ((uint8_t)1 << Flag); + } + void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -663,6 +698,10 @@ return SpillAGPR; } + Register getSGPRForEXECCopy() const { return SGPRForEXECCopy; } + + void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; } + ArrayRef getVGPRSpillAGPRs() const { return SpillVGPR; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -62,6 +62,9 @@ Occupancy = ST.computeOccupancy(F, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); + const_cast(MF).getRegInfo().addDelegate(this); + VRegFlags.reserve(256); + // FIXME: Should have analysis or something rather than attribute to detect // calls. const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); @@ -309,24 +312,11 @@ bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, unsigned LaneIndex) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. 
Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRSpillToVGPRLanes.erase(FI); - return false; - } - + LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); SpillVGPRs.push_back(LaneVGPR); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } @@ -533,6 +523,16 @@ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } +void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) { + VRegFlags.grow(Reg); +} + +void SIMachineFunctionInfo::MRI_NotecloneVirtualRegister(Register NewReg, + Register SrcReg) { + VRegFlags.grow(NewReg); + VRegFlags[NewReg] = VRegFlags[SrcReg]; +} + Register SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const { const GCNSubtarget &ST = MF.getSubtarget(); @@ -640,6 +640,10 @@ if (MFI.getVGPRForAGPRCopy()) VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); + + if (MFI.getSGPRForEXECCopy()) + SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI); + auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -646,6 +646,11 @@ assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } + // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. + Register ExecCopyReg = MFI->getSGPRForEXECCopy(); + if (ExecCopyReg) + reserveRegisterTuples(Reserved, ExecCopyReg); + // Reserve VGPRs/AGPRs. 
// unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); @@ -711,9 +716,6 @@ for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); - for (auto Reg : MFI->getSGPRSpillVGPRs()) - reserveRegisterTuples(Reserved, Reg); - return Reserved; } @@ -1029,6 +1031,8 @@ case AMDGPU::SI_SPILL_A32_RESTORE: case AMDGPU::SI_SPILL_AV32_SAVE: case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_WWM_V32_SAVE: + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1961,6 +1965,40 @@ } } +static void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + RegScavenger *RS) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool IsWave32 = ST.isWave32(); + if (RS->isRegUsed(AMDGPU::SCC)) { + // Insert two move instructions, one to save the original value of EXEC and + // the other to turn on all bits in EXEC. This is required as we can't use + // the single instruction S_OR_SAVEEXEC that clobbers SCC. + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + } else { + const unsigned OrSaveExec = + IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + auto SaveExec = + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); + SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. + } +} + +static void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + Register Reg) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill); +} + bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -2039,7 +2077,8 @@ case AMDGPU::SI_SPILL_AV128_SAVE: case AMDGPU::SI_SPILL_AV96_SAVE: case AMDGPU::SI_SPILL_AV64_SAVE: - case AMDGPU::SI_SPILL_AV32_SAVE: { + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_WWM_V32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2048,11 +2087,18 @@ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; auto *MBB = MI->getParent(); + bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); + if (IsWWMRegSpill) + insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS); + buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); + if (IsWWMRegSpill) + restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); + MI->eraseFromParent(); return true; } @@ -2085,7 +2131,8 @@ case AMDGPU::SI_SPILL_AV224_RESTORE: case AMDGPU::SI_SPILL_AV256_RESTORE: case AMDGPU::SI_SPILL_AV512_RESTORE: - case AMDGPU::SI_SPILL_AV1024_RESTORE: { + case AMDGPU::SI_SPILL_AV1024_RESTORE: + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2094,10 +2141,17 @@ unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; auto *MBB = MI->getParent(); + bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); + if (IsWWMRegSpill) + insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS); + buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); + if (IsWWMRegSpill) + restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); + MI->eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -13,6 +13,7 @@ ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: ; implicit-def: $vgpr40 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -246,6 +246,7 @@ ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 +; MUBUF-NEXT: ; implicit-def: $vgpr40 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: v_mov_b32_e32 v0, 11 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 @@ -283,6 +284,7 @@ ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 10 +; FLATSCR-NEXT: ; implicit-def: $vgpr40 ; FLATSCR-NEXT: 
scratch_store_dword off, v0, s32 offset:8 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 @@ -323,8 +325,9 @@ ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: ; implicit-def: $vgpr40 ; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -406,8 +409,9 @@ ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: ; implicit-def: $vgpr40 ; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -8,75 +8,83 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: v_mov_b32_e32 v13, v3 -; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; 
CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: v_mov_b32_e32 v9, v7 +; CHECK-NEXT: v_mov_b32_e32 v14, v1 +; CHECK-NEXT: v_mov_b32_e32 v13, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v3 +; CHECK-NEXT: v_mov_b32_e32 v11, v4 +; CHECK-NEXT: v_mov_b32_e32 v10, v5 +; CHECK-NEXT: v_mov_b32_e32 v9, v6 +; CHECK-NEXT: v_mov_b32_e32 v8, v7 ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, v14 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v12 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v10 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill 
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v8, s4, 0 -; CHECK-NEXT: v_writelane_b32 v8, s5, 1 -; CHECK-NEXT: v_writelane_b32 v8, s6, 2 -; CHECK-NEXT: v_writelane_b32 v8, s7, 3 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: v_writelane_b32 v0, s6, 2 +; CHECK-NEXT: v_writelane_b32 v0, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, s5 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v8, s4, 4 +; CHECK-NEXT: v_writelane_b32 v0, s4, 4 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], 
s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v6, v10 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v4, v12 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v2, v14 -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v0, v16 -; CHECK-NEXT: v_readfirstlane_b32 s12, v7 -; CHECK-NEXT: v_readfirstlane_b32 s10, v6 -; CHECK-NEXT: v_readfirstlane_b32 s9, v5 -; CHECK-NEXT: v_readfirstlane_b32 s8, v4 -; CHECK-NEXT: v_readfirstlane_b32 s7, v3 -; CHECK-NEXT: v_readfirstlane_b32 s6, v2 -; CHECK-NEXT: v_readfirstlane_b32 s5, v1 -; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_mov_b32_e32 v8, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v10 +; 
CHECK-NEXT: v_mov_b32_e32 v6, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v12 +; CHECK-NEXT: v_mov_b32_e32 v4, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v14 +; CHECK-NEXT: v_mov_b32_e32 v2, v15 +; CHECK-NEXT: v_mov_b32_e32 v1, v16 +; CHECK-NEXT: v_readfirstlane_b32 s12, v8 +; CHECK-NEXT: v_readfirstlane_b32 s10, v7 +; CHECK-NEXT: v_readfirstlane_b32 s9, v6 +; CHECK-NEXT: v_readfirstlane_b32 s8, v5 +; CHECK-NEXT: v_readfirstlane_b32 s7, v4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v3 +; CHECK-NEXT: v_readfirstlane_b32 s5, v2 +; CHECK-NEXT: v_readfirstlane_b32 s4, v1 ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 ; CHECK-NEXT: s_mov_b32 s14, s9 @@ -85,68 +93,79 @@ ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v8, s12, 5 -; CHECK-NEXT: v_writelane_b32 v8, s13, 6 -; CHECK-NEXT: v_writelane_b32 v8, s14, 7 -; CHECK-NEXT: v_writelane_b32 v8, s15, 8 -; CHECK-NEXT: v_writelane_b32 v8, s16, 9 -; CHECK-NEXT: v_writelane_b32 v8, s17, 10 -; CHECK-NEXT: v_writelane_b32 v8, s18, 11 -; CHECK-NEXT: v_writelane_b32 v8, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v6, v9 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v4, v11 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v2, v13 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v0, v15 -; CHECK-NEXT: v_mov_b32_e32 v1, v16 +; CHECK-NEXT: v_writelane_b32 v0, s12, 5 +; CHECK-NEXT: v_writelane_b32 v0, s13, 6 +; CHECK-NEXT: v_writelane_b32 v0, s14, 7 +; CHECK-NEXT: v_writelane_b32 v0, s15, 8 +; CHECK-NEXT: v_writelane_b32 v0, s16, 9 +; CHECK-NEXT: v_writelane_b32 v0, s17, 10 +; CHECK-NEXT: v_writelane_b32 v0, s18, 11 +; CHECK-NEXT: v_writelane_b32 v0, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: v_mov_b32_e32 v8, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v12 +; CHECK-NEXT: v_mov_b32_e32 
v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v4, v14 +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v2, v16 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] ; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19] -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7] -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v8, s4, 13 +; CHECK-NEXT: v_writelane_b32 v0, s4, 13 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v8, 13 -; CHECK-NEXT: v_readlane_b32 s8, v8, 5 -; CHECK-NEXT: v_readlane_b32 s9, v8, 6 -; CHECK-NEXT: v_readlane_b32 s10, v8, 7 -; CHECK-NEXT: v_readlane_b32 s11, v8, 8 -; CHECK-NEXT: v_readlane_b32 s12, v8, 9 -; CHECK-NEXT: v_readlane_b32 s13, v8, 10 -; CHECK-NEXT: v_readlane_b32 s14, v8, 11 -; CHECK-NEXT: v_readlane_b32 s15, v8, 12 -; CHECK-NEXT: v_readlane_b32 s16, v8, 0 -; CHECK-NEXT: v_readlane_b32 s17, v8, 1 -; CHECK-NEXT: v_readlane_b32 s18, v8, 2 -; CHECK-NEXT: v_readlane_b32 s19, v8, 3 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v2, 13 +; CHECK-NEXT: v_readlane_b32 s8, v2, 5 +; CHECK-NEXT: v_readlane_b32 s9, v2, 6 +; CHECK-NEXT: v_readlane_b32 s10, v2, 7 +; CHECK-NEXT: v_readlane_b32 s11, v2, 8 +; CHECK-NEXT: v_readlane_b32 s12, v2, 9 +; CHECK-NEXT: v_readlane_b32 s13, v2, 10 +; CHECK-NEXT: v_readlane_b32 s14, v2, 11 +; CHECK-NEXT: v_readlane_b32 s15, v2, 12 +; CHECK-NEXT: v_readlane_b32 s16, v2, 0 +; CHECK-NEXT: v_readlane_b32 s17, v2, 1 +; CHECK-NEXT: v_readlane_b32 s18, v2, 2 +; CHECK-NEXT: v_readlane_b32 s19, v2, 3 ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_readlane_b32 s4, v8, 4 +; CHECK-NEXT: s_or_saveexec_b32 s21, -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s4, v0, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; 
CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -242,8 +242,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s16, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -23,6 +23,7 @@ ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: s_mov_b64 exec, s[18:19] +; FIXEDABI-NEXT: ; implicit-def: $vgpr40 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 ; FIXEDABI-NEXT: v_writelane_b32 v41, s16, 0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -902,6 +902,11 @@ ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; 
CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_waitcnt expcnt(1) ; CHECK-NEXT: v_writelane_b32 v0, s30, 0 ; CHECK-NEXT: v_writelane_b32 v0, s31, 1 @@ -978,9 +983,6 @@ ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 ; CHECK-NEXT: s_cmp_eq_u32 s31, 0 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -8,12 +8,12 @@ @alias = hidden alias void (), void ()* @aliasee_default ; ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 41 +; GFX908: .amdhsa_next_free_vgpr 32 ; GFX908-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A: .amdhsa_next_free_vgpr 71 +; GFX90A: .amdhsa_next_free_vgpr 59 ; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A-NEXT: .amdhsa_accum_offset 44 +; GFX90A-NEXT: .amdhsa_accum_offset 32 define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,7 +9,7 @@ ; The parent kernel has a higher VGPR usage than the possible callees. 
; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 42 +; CHECK: .amdhsa_next_free_vgpr 41 ; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel1() #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -178,7 +178,7 @@ ; restored. No FP is required. ; ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: -; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] @@ -189,7 +189,7 @@ ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -227,6 +227,7 @@ ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s42, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; clobber s42 @@ -400,8 +401,9 @@ ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 +; GCN-NEXT: ; implicit-def: 
$vgpr0 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} @@ -439,10 +441,11 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN-NEXT: ; implicit-def: $vgpr48 ; MUBUF-DAG: buffer_store_dword ; FLATSCR-DAG: scratch_store_dword @@ -450,7 +453,7 @@ ; FLATSCR: s_add_i32 s32, s32, 12{{$}} ; GCN: ;;#ASMSTART -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -485,7 +488,7 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 vcc_lo, s33 ; GCN-DAG: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill @@ -497,7 +500,7 @@ ; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; 
MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 @@ -591,7 +594,7 @@ ; VGPR. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s33, s0 +; FLATSCR: s_mov_b32 s33, s2 ; MUBUF: s_mov_b32 s33, s32 ; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] @@ -632,14 +635,14 @@ ; Make sure that the FP save happens after restoring exec from the same ; register. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 s0, s33 +; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; FLATSCR: s_mov_b32 s33, s32 ; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 s[2:3], -1 -; FLATSCR: s_mov_b64 exec, s[2:3] -; FLATSCR: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] +; FLATSCR: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, s0 +; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", @@ -672,7 +675,7 @@ ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] ; GCN-NOT: v_mov_b32_e32 v0, 0x100c -; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 +; MUBUF: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; FLATSCR: v_mov_b32_e32 v0, 0 ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll 
b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -30,7 +30,14 @@ ; ; GCN_DBG-LABEL: test_loop: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s10, -1 +; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 +; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0 ; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa @@ -39,11 +46,20 @@ ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec +; GCN_DBG-NEXT: s_mov_b64 exec, -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -65,6 +81,9 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock ; 
GCN_DBG-NEXT: s_endpgm @@ -105,16 +124,31 @@ ; ; GCN_DBG-LABEL: loop_const_true: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s10, -1 +; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 +; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -136,6 +170,9 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -170,16 +207,31 @@ ; ; GCN_DBG-LABEL: loop_const_false: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s10, -1 +; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN_DBG-NEXT: 
s_add_u32 s8, s8, s3 +; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -201,6 +253,9 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -236,16 +291,31 @@ ; ; GCN_DBG-LABEL: loop_const_undef: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s10, -1 +; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 +; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; 
GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -265,6 +335,9 @@ ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -314,7 +387,14 @@ ; ; GCN_DBG-LABEL: loop_arg_0: ; GCN_DBG: ; %bb.0: ; %entry +; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN_DBG-NEXT: s_mov_b32 s10, -1 +; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 +; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 +; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 +; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 @@ -331,11 +411,19 @@ ; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: 
.LBB4_1: ; %for.exit ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: s_waitcnt expcnt(0) +; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] +; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 @@ -358,6 +446,9 @@ ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 +; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -19,14 +19,14 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, 
s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -84,14 +84,14 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -161,7 +161,7 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -170,6 +170,9 @@ ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: s_branch 
[[TEMP_BB:.LBB[0-9_]+]] @@ -180,7 +183,7 @@ ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] +; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: store_dword @@ -260,6 +263,9 @@ ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword [[VGPR]] +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: s_branch [[INNER_IF_OUTER_ELSE:.LBB[0-9_]+]] @@ -270,14 +276,14 @@ ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] +; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: ; GCN-O0: store_dword ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz 
[[FLOW1:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -287,7 +293,7 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -357,7 +363,7 @@ ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[SPILL_LANE_1:[0-9]+]] -; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] +; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: @@ -419,61 +425,81 @@ ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword [[VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN-O0: buffer_load_dword -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], 
[[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0: buffer_load_dword [[RESTORED_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_mov_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], 
s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword [[RESTORED_1_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_1_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_1_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] ; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_1_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz 
[[FLOW2:.LBB[0-9_]+]] ; GCN-O0: {{^}}[[FLOW2]]: -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword [[RESTORED_2_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] ; GCN-O0: s_branch [[FLOW:.LBB[0-9_]+]] ; GCN-O0: {{^}}[[FLOW]]: +; GCN-O0: buffer_load_dword [[RESTORED_3_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_3_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execz [[FLOW3:.LBB[0-9_]+]] ; GCN-O0: ; %bb.{{[0-9]+}}: -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0: buffer_load_dword [[RESTORED_4_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] +; GCN-O0-DAG: v_writelane_b32 
[[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] +; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; GCN-O0-NEXT: buffer_store_dword [[RESTORED_4_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; GCN-O0: {{^}}[[FLOW3]]: ; GCN-O0-COUNT-4: buffer_load_dword -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] +; GCN-O0: buffer_load_dword [[RESTORED_5_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] ; GCN-O0: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-O0-COUNT-2: s_mov_b64 -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] -; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], 
s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] +; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] ; GCN-O0-COUNT-4: buffer_store_dword ; GCN-O0: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -10,7 +10,7 @@ ; GCN-LABEL: {{^}}divergent_if_endif: -; VGPR: workitem_private_segment_byte_size = 12{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: @@ -19,7 +19,7 @@ ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}} ; Spill saved exec ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec @@ -82,13 +82,13 @@ } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 16{{$}} +; VGPR: workitem_private_segment_byte_size = 20{{$}} ; GCN: {{^}}; %bb.0: ; GCN-DAG: s_mov_b32 m0, -1 ; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}} ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -166,7 +166,7 @@ ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] +; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]] ; GCN: s_mov_b64 
s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] @@ -187,6 +187,7 @@ ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow +; VGPR: buffer_load_dword ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -33,6 +33,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 @@ -71,6 +72,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 @@ -109,6 +111,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 @@ -147,6 +150,7 @@ 
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v41, s16, 0 diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir @@ -19,10 +19,11 @@ ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, $vgpr0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, killed $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, killed $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, killed $vgpr0 + ; CHECK-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, killed $vgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -19,25 +19,26 @@ ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: ; implicit-def: $vgpr41 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_writelane_b32 v40, s46, 14 +; CHECK-NEXT: v_writelane_b32 v41, s30, 0 +; CHECK-NEXT: v_writelane_b32 v41, s31, 1 +; CHECK-NEXT: v_writelane_b32 v41, s34, 2 +; CHECK-NEXT: v_writelane_b32 v41, s35, 3 +; CHECK-NEXT: v_writelane_b32 v41, s36, 4 +; CHECK-NEXT: v_writelane_b32 v41, s37, 5 +; CHECK-NEXT: v_writelane_b32 v41, s38, 6 +; CHECK-NEXT: v_writelane_b32 v41, s39, 7 +; CHECK-NEXT: v_writelane_b32 v41, s40, 8 +; CHECK-NEXT: v_writelane_b32 v41, s41, 9 +; CHECK-NEXT: v_writelane_b32 v41, s42, 10 +; CHECK-NEXT: v_writelane_b32 v41, s43, 11 +; CHECK-NEXT: v_writelane_b32 v41, s44, 12 +; CHECK-NEXT: v_writelane_b32 v41, s45, 13 +; CHECK-NEXT: v_writelane_b32 v41, s46, 14 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: @@ -45,12 +46,12 @@ ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v40, 
s47, 15 +; CHECK-NEXT: v_writelane_b32 v41, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: v_writelane_b32 v42, s16, 0 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -68,33 +69,33 @@ ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s47, v40, 15 -; CHECK-NEXT: v_readlane_b32 s46, v40, 14 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s47, v41, 15 +; CHECK-NEXT: v_readlane_b32 s46, v41, 14 +; CHECK-NEXT: v_readlane_b32 s45, v41, 13 +; CHECK-NEXT: v_readlane_b32 s44, v41, 12 +; CHECK-NEXT: v_readlane_b32 
s43, v41, 11 +; CHECK-NEXT: v_readlane_b32 s42, v41, 10 +; CHECK-NEXT: v_readlane_b32 s41, v41, 9 +; CHECK-NEXT: v_readlane_b32 s40, v41, 8 +; CHECK-NEXT: v_readlane_b32 s39, v41, 7 +; CHECK-NEXT: v_readlane_b32 s38, v41, 6 +; CHECK-NEXT: v_readlane_b32 s37, v41, 5 +; CHECK-NEXT: v_readlane_b32 s36, v41, 4 +; CHECK-NEXT: v_readlane_b32 s35, v41, 3 +; CHECK-NEXT: v_readlane_b32 s34, v41, 2 +; CHECK-NEXT: v_readlane_b32 s31, v41, 1 +; CHECK-NEXT: v_readlane_b32 s30, v41, 0 ; CHECK-NEXT: v_readlane_b32 s4, v42, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -25,6 +25,7 @@ ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: ; implicit-def: $vgpr42 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v42, s30, 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -119,21 +119,15 @@ ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; FLAT_SCR_OPT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s104, exec_lo -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s4 +; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s2, 0 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 4 -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s3, 1 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s4 ; 4-byte Folded Spill +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 +; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 0 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v72, off, s4 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s104 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -230,22 +224,14 @@ ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v2, s3 -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload +; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_OPT-NEXT: s_waitcnt 
vmcnt(0) -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v2, 0 -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v2, 1 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 -; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2 +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 ; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_OPT-NEXT: s_endpgm @@ -253,21 +239,15 @@ ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s104, exec_lo -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s4 +; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s2, 0 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 4 -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s3, 1 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s4 ; 4-byte Folded Spill +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 +; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 0 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v72, off, s4 -; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s104 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART @@ -364,22 +344,14 @@ ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; 
FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v2, s3 -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload +; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v2, 0 -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v2, 1 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 -; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) -; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2 +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 ; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_ARCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -12,14 +12,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo + ; CHECK: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable 
$sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_lo @@ -38,14 +37,13 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi + ; CHECK: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_hi @@ -64,17 +62,16 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $exec + ; CHECK: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = 
V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec @@ -96,13 +93,12 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo @@ -120,13 +116,12 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead 
renamable $sgpr1, implicit-def $exec_hi + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi @@ -144,16 +139,15 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 
killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,14 +13,13 @@ bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit-def $m0 + ; CHECK: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -44,13 +43,12 @@ bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 - ; CHECK: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} + ; CHECK: renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 + ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: 
S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -16,6 +16,7 @@ ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] +; SPILL-TO-VGPR-NEXT: ; implicit-def: $vgpr40 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -8,11 +8,13 @@ ; SDAG-LABEL: gfx_func: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s36, s33 +; SDAG-NEXT: s_mov_b32 s38, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; SDAG-NEXT: s_mov_b64 exec, s[34:35] +; SDAG-NEXT: ; implicit-def: $vgpr40 +; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 ; SDAG-NEXT: v_writelane_b32 v40, s6, 2 @@ -33,7 +35,6 @@ ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 -; SDAG-NEXT: s_addk_i32 s32, 0x400 ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 ; SDAG-NEXT: v_writelane_b32 v40, s25, 21 ; SDAG-NEXT: s_getpc_b64 s[34:35] @@ -81,18 +82,20 @@ ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: 
s_mov_b64 exec, s[34:35] ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: s_mov_b32 s33, s36 +; SDAG-NEXT: s_mov_b32 s33, s38 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: gfx_func: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s36, s33 +; GISEL-NEXT: s_mov_b32 s38, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[34:35] +; GISEL-NEXT: ; implicit-def: $vgpr40 +; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 ; GISEL-NEXT: v_writelane_b32 v40, s6, 2 @@ -113,7 +116,6 @@ ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 -; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 ; GISEL-NEXT: v_writelane_b32 v40, s25, 21 ; GISEL-NEXT: s_getpc_b64 s[34:35] @@ -161,7 +163,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s36 +; GISEL-NEXT: s_mov_b32 s33, s38 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void @extern_c_func() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -103,6 +103,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, 1 @@ -136,15 +137,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -171,16 +173,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -205,15 +209,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -244,6 +249,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -251,6 +257,7 @@ ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -279,13 +286,15 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 ; GFX10-NEXT: 
v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -316,13 +325,15 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -352,13 +363,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -392,6 +405,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 
0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -399,6 +413,7 @@ ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -427,13 +442,15 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -464,13 +481,15 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -500,13 +519,15 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 
exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -538,6 +559,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -570,8 +592,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -604,8 +627,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; 
GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -638,8 +662,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -676,6 +701,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -709,8 +735,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 @@ -744,8 +771,9 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 @@ -779,8 +807,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 @@ -817,6 +846,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -850,8 +880,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 @@ -885,8 +916,9 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 @@ -920,8 +952,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 
s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 @@ -956,6 +989,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -988,8 +1022,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -1022,8 +1057,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -1056,8 +1092,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -1094,6 +1131,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1127,8 +1165,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 @@ -1162,8 +1201,9 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 @@ -1197,8 +1237,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 @@ -1235,6 +1276,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1268,8 +1310,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; 
GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 @@ -1303,8 +1346,9 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 @@ -1338,8 +1382,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 @@ -1374,6 +1419,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 @@ -1406,8 +1452,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; 
GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -1440,8 +1487,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -1474,8 +1522,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -1510,6 +1559,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b @@ -1543,15 +1593,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: 
v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -1578,16 +1629,17 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -1612,15 +1664,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; 
GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -1652,6 +1705,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1685,8 +1739,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1721,8 +1776,9 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 @@ -1757,8 +1813,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, 
external_void_func_v2i64@rel32@lo+4 @@ -1794,6 +1851,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -1829,8 +1887,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -1866,8 +1925,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -1901,8 +1961,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -1943,6 +2004,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: 
; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 @@ -1978,8 +2040,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2016,8 +2079,9 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -2052,8 +2116,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2096,6 +2161,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 @@ -2133,8 +2199,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; 
GFX10-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2173,17 +2240,18 @@ ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -2210,8 +2278,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off @@ -2252,6 +2321,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; 
GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 @@ -2284,8 +2354,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2318,8 +2389,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -2352,8 +2424,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2388,6 +2461,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 @@ -2420,8 +2494,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -2454,8 +2529,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -2488,8 +2564,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -2524,6 +2601,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2557,15 +2635,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_writelane_b32 
v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -2592,16 +2671,17 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -2626,15 +2706,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -2663,6 +2744,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2697,8 +2779,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -2733,17 +2816,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -2768,8 +2852,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -2806,6 +2891,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2842,8 +2928,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 @@ -2880,8 +2967,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: 
v_dual_mov_b32 v3, -1.0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -2916,8 +3004,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 @@ -2956,6 +3045,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2989,15 +3079,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -3024,16 +3115,17 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, 
s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -3058,15 +3150,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -3095,6 +3188,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; 
GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3130,8 +3224,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3167,8 +3262,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -3202,8 +3298,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3241,6 +3338,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -3278,8 
+3376,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3317,8 +3416,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -3353,8 +3453,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 @@ -3395,6 +3496,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -3427,8 +3529,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 @@ -3461,8 +3564,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 @@ -3495,8 +3599,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 @@ -3532,6 +3637,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -3564,8 +3670,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; 
GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3598,8 +3705,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 @@ -3632,8 +3740,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 @@ -3669,6 +3778,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -3701,8 +3811,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3735,8 +3846,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 
GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 @@ -3769,8 +3881,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 @@ -3805,6 +3918,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -3838,15 +3952,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 +; GFX10-NEXT: 
v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -3873,16 +3988,17 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -3907,15 +4023,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -3944,6 +4061,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 @@ -3977,15 +4095,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -4012,17 +4131,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4047,15 +4167,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4085,6 +4206,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -4117,8 +4239,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; 
GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -4151,8 +4274,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 @@ -4185,8 +4309,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 @@ -4221,6 +4346,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 @@ -4254,15 +4380,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; 
GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -4289,17 +4416,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4324,15 +4452,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, 
external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4362,6 +4491,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -4394,8 +4524,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 @@ -4428,8 +4559,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 @@ -4462,8 +4594,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; 
GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 @@ -4499,6 +4632,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -4531,8 +4665,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4565,8 +4700,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 @@ -4599,8 +4735,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 @@ -4635,6 +4772,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], 
s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -4668,15 +4806,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -4703,16 +4842,17 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, 
v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4737,15 +4877,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -4774,6 +4915,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -4808,8 +4950,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 @@ -4844,17 +4987,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 
offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -4879,8 +5023,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 @@ -4917,6 +5062,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 @@ -4952,8 +5098,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; 
implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 @@ -4989,8 +5136,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -5024,8 +5172,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 @@ -5064,6 +5213,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -5096,8 +5246,9 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, 
external_void_func_v4i32@rel32@lo+4 @@ -5130,8 +5281,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 @@ -5164,8 +5316,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 @@ -5200,6 +5353,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5235,8 +5389,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -5272,8 +5427,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 
exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 @@ -5307,8 +5463,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -5346,6 +5503,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5382,8 +5540,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -5420,8 +5579,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_writelane_b32 v40, 
s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -5456,8 +5616,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -5499,6 +5660,7 @@ ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5535,8 +5697,9 @@ ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] @@ -5574,8 +5737,9 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] @@ -5613,8 +5777,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-SCRATCH-NEXT: 
v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] @@ -5653,6 +5818,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5692,8 +5858,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 @@ -5733,8 +5900,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 @@ -5770,8 +5938,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 @@ -5816,6 +5985,7 @@ ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -5854,8 +6024,9 @@ ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35] @@ -5895,8 +6066,9 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[0:1] @@ -5936,8 +6108,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] @@ -5981,6 +6154,7 @@ ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: ; 
implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6024,8 +6198,9 @@ ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[34:35] @@ -6069,8 +6244,9 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 ; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1] @@ -6114,8 +6290,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] @@ -6164,6 +6341,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] @@ -6209,8 +6387,9 @@ ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 
+; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -6257,8 +6436,9 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: global_load_b32 v32, v[0:1], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 @@ -6304,8 +6484,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 @@ -6351,31 +6532,32 @@ ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: v_writelane_b32 v43, s34, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_dword v[41:42], v0, off +; GFX9-NEXT: global_store_dword v[40:41], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 ; GFX9-NEXT: v_readlane_b32 s34, v43, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 @@ -6390,34 +6572,35 @@ ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 
4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: ; implicit-def: $vgpr42 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v43, s34, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_dword v[41:42], v0, off +; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_readlane_b32 s34, v43, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 @@ -6434,34 +6617,35 @@ ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr42 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v43, s0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc +; GFX11-NEXT: global_store_b32 v[40:41], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_readlane_b32 s0, v43, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 @@ -6476,34 +6660,35 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr42 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, 
off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v43, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 @@ -6529,6 +6714,7 @@ ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -6565,8 +6751,9 @@ ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] @@ -6604,8 +6791,9 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] @@ -6643,8 +6831,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] @@ -6684,6 +6873,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 ; GFX9-NEXT: s_addk_i32 s32, 0x800 @@ -6721,16 +6911,17 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -6758,19 +6949,20 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 
8 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -6797,16 +6989,17 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -6843,6 +7036,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 3 
; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: s_addk_i32 s32, 0x800 @@ -6887,8 +7081,9 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 @@ -6940,10 +7135,11 @@ ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: s_add_i32 vcc_lo, s33, 8 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, vcc_lo :: v_dual_mov_b32 v1, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -6987,9 +7183,10 @@ ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_add_i32 vcc_lo, s33, 8 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, vcc_lo ; 
GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s33 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 @@ -7045,6 +7242,7 @@ ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 @@ -7099,8 +7297,9 @@ ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] @@ -7156,8 +7355,9 @@ ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] @@ -7209,8 +7409,9 @@ ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] @@ -7261,7 +7462,7 @@ ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; 
GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7269,6 +7470,8 @@ ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 @@ -7299,7 +7502,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s59, 27 ; GFX9-NEXT: v_writelane_b32 v40, s60, 28 ; GFX9-NEXT: v_writelane_b32 v40, s61, 29 -; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s62, 30 ; GFX9-NEXT: v_writelane_b32 v40, s63, 31 ; GFX9-NEXT: s_getpc_b64 s[4:5] @@ -7346,7 +7548,7 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -7354,7 +7556,7 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7364,8 +7566,9 @@ ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 
; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 @@ -7442,7 +7645,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -7450,7 +7653,7 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s4, s33 +; GFX11-NEXT: s_mov_b32 s5, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7458,8 +7661,9 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v31, off, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -7533,7 +7737,7 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: s_mov_b32 s33, s4 +; GFX11-NEXT: s_mov_b32 s33, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -7541,7 +7745,7 @@ ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7550,8 +7754,9 @@ ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: 
scratch_load_dwordx2 v[32:33], off, s33 offset:16 ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 @@ -7626,7 +7831,7 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s5 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -7646,6 +7851,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -7679,15 +7885,16 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 @@ -7714,16 +7921,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 @@ -7748,15 +7957,16 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 @@ -7785,8 +7995,9 
@@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -7819,9 +8030,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 @@ -7855,9 +8067,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 @@ -7891,9 +8104,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: 
s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 @@ -7929,8 +8143,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -7963,9 +8178,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 @@ -7999,9 +8215,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 @@ -8035,9 +8252,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; 
GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 @@ -8073,8 +8291,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 42 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -8107,9 +8326,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 42 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 @@ -8143,9 +8363,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 42 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 @@ -8179,9 +8400,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, 
s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 @@ -8217,9 +8439,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 @@ -8254,9 +8477,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x7b -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 @@ -8293,9 +8517,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x7b -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 @@ -8332,9 +8557,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 @@ -8373,9 +8599,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -8414,8 +8641,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -8457,8 +8685,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -8500,8 +8729,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -8546,11 +8776,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -8589,9 +8820,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8634,9 +8866,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v2i64_inreg@rel32@lo+4 @@ -8679,9 +8912,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 @@ -8726,9 +8960,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -8773,8 +9008,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -8822,8 +9058,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -8871,8 +9108,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -8925,10 +9163,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 @@ -8978,8 +9217,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -9033,8 +9273,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b64 
s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -9088,8 +9329,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -9147,8 +9389,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -9181,9 +9424,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 @@ -9217,9 +9461,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_movk_i32 s4, 0x4400 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; 
GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 @@ -9253,9 +9498,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 @@ -9291,8 +9537,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 4.0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -9325,9 +9572,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 4.0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 @@ -9361,9 +9609,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; 
GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 4.0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 @@ -9397,9 +9646,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 @@ -9435,9 +9685,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -9472,9 +9723,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9511,9 +9763,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, 
v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9550,9 +9803,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 @@ -9591,10 +9845,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -9631,9 +9886,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; 
GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9673,9 +9929,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9715,9 +9972,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 @@ -9759,12 +10017,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -9805,9 +10064,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; 
GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 @@ -9853,9 +10113,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1.0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 @@ -9901,9 +10162,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 @@ -9951,9 +10213,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 @@ -9988,9 +10251,10 @@ ; GFX10-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 @@ -10027,9 +10291,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 @@ -10066,9 +10331,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 @@ -10107,11 +10373,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; 
GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -10150,9 +10417,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 @@ -10195,9 +10463,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 @@ -10240,9 +10509,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 @@ -10287,13 +10557,14 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: 
s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 @@ -10336,9 +10607,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10387,9 +10659,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10438,9 +10711,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: 
s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 @@ -10491,9 +10765,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -10525,9 +10800,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10561,9 +10837,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10597,9 +10874,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; 
GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 @@ -10636,10 +10914,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -10672,8 +10951,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -10710,8 +10990,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -10748,8 +11029,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -10789,10 +11071,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -10825,8 +11108,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -10863,8 +11147,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -10901,8 +11186,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 
4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -10942,9 +11228,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 @@ -10979,9 +11266,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -11018,9 +11306,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v3i16_inreg@rel32@lo+4 @@ -11057,9 +11346,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 @@ -11098,9 +11388,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 @@ -11135,9 +11426,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -11174,9 +11466,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; 
GFX11-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 @@ -11213,9 +11506,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 @@ -11254,10 +11548,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -11290,8 +11585,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -11328,8 +11624,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 
off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -11366,8 +11663,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -11407,9 +11705,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 @@ -11444,9 +11743,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11483,9 +11783,10 @@ ; GFX11-NEXT: 
scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 0x20001 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11522,9 +11823,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 @@ -11563,9 +11865,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -11597,9 +11900,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; 
GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 @@ -11633,9 +11937,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 @@ -11669,9 +11974,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 @@ -11708,10 +12014,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -11744,8 +12051,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 @@ -11782,8 +12090,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 @@ -11820,8 +12129,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 @@ -11861,9 +12171,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -11898,9 +12209,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -11937,9 +12249,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 @@ -11976,9 +12289,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 @@ -12017,10 +12331,11 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 ; GFX9-NEXT: s_mov_b32 s4, 3 ; 
GFX9-NEXT: s_mov_b32 s5, 4 @@ -12057,9 +12372,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 @@ -12099,9 +12415,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 @@ -12141,9 +12458,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 @@ -12185,11 +12503,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: 
v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 @@ -12228,9 +12547,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 3 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -12273,9 +12593,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 3 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -12318,9 +12639,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -12365,12 +12687,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 @@ -12405,8 +12728,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -12447,8 +12771,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12489,8 +12814,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, 
s6, 2 @@ -12534,11 +12860,12 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -12577,9 +12904,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12622,9 +12950,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12667,9 +12996,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 
1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 @@ -12714,12 +13044,13 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -12760,9 +13091,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 @@ -12808,9 +13140,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 @@ -12856,9 +13189,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 @@ -12906,9 +13240,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -12956,8 +13291,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -13008,8 +13344,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -13060,8 +13397,9 @@ ; 
GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -13116,6 +13454,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13124,7 +13464,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 @@ -13171,9 +13510,10 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 +; GFX10-NEXT: ; implicit-def: $vgpr40 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_mov_b32 s4, 1 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 @@ -13228,9 +13568,10 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: 
v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_mov_b32 s4, 1 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 @@ -13285,9 +13626,10 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 @@ -13344,6 +13686,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13354,7 +13698,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 @@ -13410,8 +13753,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; 
GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -13478,8 +13822,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -13546,8 +13891,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -13618,6 +13964,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -13633,7 +13981,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 @@ -13728,8 +14075,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 
exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -13841,8 +14189,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -13948,8 +14297,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14061,6 +14411,8 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -14075,7 +14427,6 @@ ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: s_load_dwordx2 s[34:35], 
s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 @@ -14176,8 +14527,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 @@ -14294,8 +14646,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 @@ -14404,8 +14757,9 @@ ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 @@ -14525,6 +14879,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -14563,8 +14918,9 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: 
buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 @@ -14601,8 +14957,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 @@ -14636,8 +14993,9 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 @@ -14680,6 +15038,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -14750,11 +15109,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -14821,8 +15181,9 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1 @@ -14875,8 +15236,9 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -14969,6 +15331,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -15045,6 +15408,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 11 ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -15120,10 +15484,11 @@ ; GFX11-NEXT: 
v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 ; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1 @@ -15180,9 +15545,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 @@ -15271,6 +15637,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -15347,6 +15714,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -15426,11 +15794,12 @@ ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; 
GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0 @@ -15488,9 +15857,10 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -15,9 +15,10 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 -; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 @@ -53,8 +54,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] @@ -93,8 +95,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_getpc_b64 s[4:5] @@ -134,6 +137,7 @@ ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s28, 0 ; GFX9-NEXT: v_writelane_b32 v0, s29, 1 ; GFX9-NEXT: v_writelane_b32 v0, s30, 2 @@ -162,6 +166,7 @@ ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s28, 0 ; GFX10-NEXT: v_writelane_b32 v0, s29, 1 ; GFX10-NEXT: v_writelane_b32 v0, s30, 2 @@ -188,9 +193,10 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: v_writelane_b32 v0, s28, 0 ; GFX11-NEXT: v_writelane_b32 v0, s29, 1 ; GFX11-NEXT: v_writelane_b32 v0, s30, 2 @@ -206,9 +212,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 ; GFX11-NEXT: 
v_readlane_b32 s29, v0, 1 ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -232,8 +238,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -273,8 +280,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -316,8 +324,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -359,32 +368,33 @@ ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mov_b32_e32 v31, v41 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 1 +; GFX9-NEXT: v_readlane_b32 s30, v41, 0 ; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 @@ -399,34 +409,35 @@ ; GFX10-NEXT: 
s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr41 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_mov_b32_e32 v41, v31 +; GFX10-NEXT: v_mov_b32_e32 v40, v31 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_mov_b32_e32 v31, v41 +; GFX10-NEXT: v_mov_b32_e32 v31, v40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 1 +; GFX10-NEXT: v_readlane_b32 s30, v41, 0 ; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v42, off, 
s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 @@ -443,34 +454,35 @@ ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr41 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: v_mov_b32_e32 v41, v31 +; GFX11-NEXT: v_mov_b32_e32 v40, v31 +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_mov_b32_e32 v31, v41 +; GFX11-NEXT: v_mov_b32_e32 v31, v40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 1 +; GFX11-NEXT: v_readlane_b32 s30, v41, 0 ; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: 
s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 @@ -494,8 +506,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -535,8 +548,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -544,8 +558,8 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s33 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 @@ -578,8 +592,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -587,8 +602,8 @@ ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s33 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, 
s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s33, s4 @@ -625,8 +640,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -666,8 +682,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 @@ -675,8 +692,8 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s34 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_mov_b32 s34, s4 @@ -709,8 +726,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -718,8 +736,8 @@ ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s34 ; 
GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s34 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s34, s4 @@ -756,6 +774,7 @@ ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 @@ -795,17 +814,18 @@ ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr41 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 @@ -836,18 +856,18 @@ ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr41 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART 
; GFX11-NEXT: ; def v40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v40 @@ -875,17 +895,18 @@ ; GFX9-LABEL: void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s33, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -893,19 +914,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s33, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; 
GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -914,18 +936,19 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: v_writelane_b32 v0, s33, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s33, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -937,17 +960,18 @@ ; GFX9-LABEL: void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s34, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: 
s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -955,19 +979,20 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s34, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -976,18 +1001,19 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: ; implicit-def: $vgpr0 ; GFX11-NEXT: v_writelane_b32 v0, s34, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s34, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ 
-1005,6 +1031,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1036,8 +1063,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 @@ -1069,8 +1097,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 @@ -1104,6 +1133,7 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 @@ -1135,8 +1165,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 
s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 @@ -1168,8 +1199,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 @@ -1203,8 +1235,9 @@ ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 @@ -1243,8 +1276,9 @@ ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr40 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 @@ -1252,8 +1286,8 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1285,8 
+1319,9 @@ ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr40 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 @@ -1294,8 +1329,8 @@ ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s40 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1328,15 +1363,16 @@ ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s30, 1 ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -1344,7 +1380,7 @@ ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v32 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; 
GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 @@ -1353,15 +1389,15 @@ ; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v41 +; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 2 +; GFX9-NEXT: v_readlane_b32 s30, v41, 1 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 @@ -1376,42 +1412,43 @@ ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: ; implicit-def: $vgpr41 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s4, 0 ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: 
;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: s_mov_b32 s4, s40 +; GFX10-NEXT: v_writelane_b32 v41, s30, 1 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v32 +; GFX10-NEXT: v_mov_b32_e32 v40, v32 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_writelane_b32 v41, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v41 +; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 2 +; GFX10-NEXT: v_readlane_b32 s30, v41, 1 +; GFX10-NEXT: v_readlane_b32 s4, v41, 0 ; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 @@ -1428,41 +1465,42 @@ ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: ; implicit-def: $vgpr41 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: 
v_writelane_b32 v41, s4, 0 ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: s_mov_b32 s4, s40 +; GFX11-NEXT: v_writelane_b32 v41, s30, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v41, v32 +; GFX11-NEXT: v_mov_b32_e32 v40, v32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: v_writelane_b32 v41, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v41 +; GFX11-NEXT: ; use v40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 2 +; GFX11-NEXT: v_readlane_b32 s30, v41, 1 +; GFX11-NEXT: v_readlane_b32 s4, v41, 0 ; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ 
-34,6 +34,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -62,6 +63,7 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 @@ -82,7 +84,7 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -91,6 +93,7 @@ ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_i1@gotpcrel32@hi+12 +; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 @@ -103,7 +106,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -142,6 +145,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) @@ -170,6 +174,7 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 @@ -190,7 +195,7 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -199,6 +204,7 @@ ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_i16@gotpcrel32@hi+12 +; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 @@ -211,7 +217,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -250,6 +256,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -278,6 +285,7 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vgpr1 ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 ; GFX10-NEXT: s_load_dwordx2 
s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 @@ -298,7 +306,7 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -307,6 +315,7 @@ ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_2xi16@gotpcrel32@hi+12 +; GFX11-NEXT: ; implicit-def: $vgpr1 ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 @@ -319,7 +328,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -367,6 +376,7 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -395,6 +405,7 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 +; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 @@ -415,7 +426,7 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, 
s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill @@ -424,6 +435,7 @@ ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, return_3xi16@gotpcrel32@hi+12 +; GFX11-NEXT: ; implicit-def: $vgpr2 ; GFX11-NEXT: v_writelane_b32 v2, s30, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 @@ -436,7 +448,7 @@ ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -1649,8 +1661,9 @@ ; GFX9-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: ; implicit-def: $vgpr2 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -1679,9 +1692,10 @@ ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] @@ -1711,9 +1725,10 @@ ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, return_512xi32@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, 
return_512xi32@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v5, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: ; implicit-def: $vgpr5 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v5, s30, 0 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -401,6 +401,7 @@ ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -487,6 +488,7 @@ ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -577,6 +579,7 @@ ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -666,6 +669,7 @@ ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -757,6 +761,7 @@ ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: 
v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -845,6 +850,7 @@ ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -938,6 +944,7 @@ ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1035,6 +1042,7 @@ ; GISEL-NEXT: s_mov_b64 exec, s[18:19] ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1141,6 +1149,7 @@ ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1235,6 +1244,7 @@ ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1327,196 +1337,198 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: v_writelane_b32 v41, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s31, 1 +; GCN-NEXT: v_writelane_b32 v41, s34, 2 +; GCN-NEXT: v_writelane_b32 v41, s35, 3 +; GCN-NEXT: v_writelane_b32 v41, s36, 4 +; GCN-NEXT: v_writelane_b32 v41, s37, 5 +; GCN-NEXT: v_writelane_b32 v41, s38, 6 +; 
GCN-NEXT: v_writelane_b32 v41, s39, 7 +; GCN-NEXT: v_writelane_b32 v41, s40, 8 +; GCN-NEXT: v_writelane_b32 v41, s41, 9 +; GCN-NEXT: v_writelane_b32 v41, s42, 10 +; GCN-NEXT: v_writelane_b32 v41, s43, 11 +; GCN-NEXT: v_writelane_b32 v41, s44, 12 +; GCN-NEXT: v_writelane_b32 v41, s45, 13 +; GCN-NEXT: v_writelane_b32 v41, s46, 14 +; GCN-NEXT: v_writelane_b32 v41, s47, 15 +; GCN-NEXT: v_writelane_b32 v41, s48, 16 +; GCN-NEXT: v_writelane_b32 v41, s49, 17 +; GCN-NEXT: v_writelane_b32 v41, s50, 18 +; GCN-NEXT: v_writelane_b32 v41, s51, 19 +; GCN-NEXT: v_writelane_b32 v41, s52, 20 +; GCN-NEXT: v_writelane_b32 v41, s53, 21 +; GCN-NEXT: v_writelane_b32 v41, s54, 22 +; GCN-NEXT: v_writelane_b32 v41, s55, 23 +; GCN-NEXT: v_writelane_b32 v41, s56, 24 +; GCN-NEXT: v_writelane_b32 v41, s57, 25 +; GCN-NEXT: v_writelane_b32 v41, s58, 26 +; GCN-NEXT: v_writelane_b32 v41, s59, 27 +; GCN-NEXT: v_writelane_b32 v41, s60, 28 +; GCN-NEXT: v_writelane_b32 v41, s61, 29 +; GCN-NEXT: v_writelane_b32 v41, s62, 30 +; GCN-NEXT: v_writelane_b32 v41, s63, 31 +; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v2 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v0, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; 
GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s63, v41, 31 +; GCN-NEXT: v_readlane_b32 s62, v41, 30 +; GCN-NEXT: v_readlane_b32 s61, v41, 29 +; GCN-NEXT: v_readlane_b32 s60, v41, 28 +; GCN-NEXT: v_readlane_b32 s59, v41, 27 +; GCN-NEXT: v_readlane_b32 s58, v41, 26 +; GCN-NEXT: v_readlane_b32 s57, v41, 25 +; GCN-NEXT: v_readlane_b32 s56, v41, 24 +; GCN-NEXT: v_readlane_b32 s55, v41, 23 +; GCN-NEXT: v_readlane_b32 s54, v41, 22 +; GCN-NEXT: v_readlane_b32 s53, v41, 21 +; GCN-NEXT: v_readlane_b32 s52, v41, 20 +; GCN-NEXT: v_readlane_b32 s51, v41, 19 +; GCN-NEXT: v_readlane_b32 s50, v41, 18 +; GCN-NEXT: v_readlane_b32 s49, v41, 17 +; GCN-NEXT: v_readlane_b32 s48, v41, 16 +; GCN-NEXT: v_readlane_b32 s47, v41, 15 +; GCN-NEXT: v_readlane_b32 s46, v41, 14 +; GCN-NEXT: v_readlane_b32 s45, v41, 13 +; GCN-NEXT: v_readlane_b32 s44, v41, 12 +; GCN-NEXT: v_readlane_b32 s43, v41, 11 +; GCN-NEXT: v_readlane_b32 s42, v41, 10 +; GCN-NEXT: 
v_readlane_b32 s41, v41, 9 +; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s39, v41, 7 +; GCN-NEXT: v_readlane_b32 s38, v41, 6 +; GCN-NEXT: v_readlane_b32 s37, v41, 5 +; GCN-NEXT: v_readlane_b32 s36, v41, 4 +; GCN-NEXT: v_readlane_b32 s35, v41, 3 +; GCN-NEXT: v_readlane_b32 s34, v41, 2 +; GCN-NEXT: v_readlane_b32 s31, v41, 1 +; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; 
GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: v_mov_b32_e32 v41, v0 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: ; implicit-def: $vgpr41 +; GISEL-NEXT: v_writelane_b32 v41, s30, 0 +; GISEL-NEXT: v_writelane_b32 v41, s31, 1 +; GISEL-NEXT: v_writelane_b32 v41, s34, 2 +; GISEL-NEXT: v_writelane_b32 v41, s35, 3 +; GISEL-NEXT: v_writelane_b32 v41, s36, 4 +; GISEL-NEXT: v_writelane_b32 v41, s37, 5 +; GISEL-NEXT: v_writelane_b32 v41, s38, 6 +; GISEL-NEXT: v_writelane_b32 v41, s39, 7 +; GISEL-NEXT: v_writelane_b32 v41, s40, 8 +; GISEL-NEXT: v_writelane_b32 v41, s41, 9 +; GISEL-NEXT: v_writelane_b32 v41, s42, 10 +; GISEL-NEXT: v_writelane_b32 v41, s43, 11 +; GISEL-NEXT: v_writelane_b32 v41, s44, 12 +; GISEL-NEXT: v_writelane_b32 v41, s45, 13 +; GISEL-NEXT: v_writelane_b32 v41, s46, 14 +; GISEL-NEXT: v_writelane_b32 v41, s47, 15 +; GISEL-NEXT: v_writelane_b32 v41, s48, 16 +; GISEL-NEXT: v_writelane_b32 v41, s49, 17 +; GISEL-NEXT: v_writelane_b32 v41, s50, 18 +; GISEL-NEXT: v_writelane_b32 v41, s51, 19 +; GISEL-NEXT: v_writelane_b32 v41, s52, 20 +; GISEL-NEXT: v_writelane_b32 v41, s53, 21 +; 
GISEL-NEXT: v_writelane_b32 v41, s54, 22 +; GISEL-NEXT: v_writelane_b32 v41, s55, 23 +; GISEL-NEXT: v_writelane_b32 v41, s56, 24 +; GISEL-NEXT: v_writelane_b32 v41, s57, 25 +; GISEL-NEXT: v_writelane_b32 v41, s58, 26 +; GISEL-NEXT: v_writelane_b32 v41, s59, 27 +; GISEL-NEXT: v_writelane_b32 v41, s60, 28 +; GISEL-NEXT: v_writelane_b32 v41, s61, 29 +; GISEL-NEXT: v_writelane_b32 v41, s62, 30 +; GISEL-NEXT: v_writelane_b32 v41, s63, 31 +; GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v41 +; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB7_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v0, v41 -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: 
v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: v_mov_b32_e32 v0, v40 +; GISEL-NEXT: v_readlane_b32 s63, v41, 31 +; GISEL-NEXT: v_readlane_b32 s62, v41, 30 +; GISEL-NEXT: v_readlane_b32 s61, v41, 29 +; GISEL-NEXT: v_readlane_b32 s60, v41, 28 +; GISEL-NEXT: v_readlane_b32 s59, v41, 27 +; GISEL-NEXT: v_readlane_b32 s58, v41, 26 +; GISEL-NEXT: v_readlane_b32 s57, v41, 25 +; GISEL-NEXT: v_readlane_b32 s56, v41, 24 +; GISEL-NEXT: v_readlane_b32 s55, v41, 23 +; GISEL-NEXT: v_readlane_b32 s54, v41, 22 +; GISEL-NEXT: v_readlane_b32 s53, v41, 21 +; GISEL-NEXT: v_readlane_b32 s52, v41, 20 +; GISEL-NEXT: v_readlane_b32 s51, v41, 19 +; GISEL-NEXT: v_readlane_b32 s50, v41, 18 +; GISEL-NEXT: v_readlane_b32 s49, v41, 17 +; GISEL-NEXT: v_readlane_b32 s48, v41, 16 +; GISEL-NEXT: v_readlane_b32 s47, v41, 15 +; GISEL-NEXT: v_readlane_b32 s46, v41, 14 +; GISEL-NEXT: v_readlane_b32 s45, v41, 13 +; GISEL-NEXT: v_readlane_b32 s44, v41, 12 +; GISEL-NEXT: v_readlane_b32 s43, v41, 11 +; GISEL-NEXT: v_readlane_b32 s42, v41, 10 +; GISEL-NEXT: v_readlane_b32 s41, v41, 9 +; GISEL-NEXT: v_readlane_b32 s40, v41, 8 +; GISEL-NEXT: v_readlane_b32 s39, v41, 7 +; GISEL-NEXT: v_readlane_b32 s38, v41, 6 +; GISEL-NEXT: v_readlane_b32 s37, v41, 5 +; GISEL-NEXT: v_readlane_b32 s36, v41, 4 +; GISEL-NEXT: v_readlane_b32 s35, v41, 3 +; GISEL-NEXT: v_readlane_b32 s34, v41, 2 +; GISEL-NEXT: v_readlane_b32 s31, v41, 1 +; GISEL-NEXT: v_readlane_b32 s30, v41, 0 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded 
Reload ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1531,12 +1543,13 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1620,19 +1633,20 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1716,7 +1730,7 @@ ; GISEL-NEXT: buffer_load_dword 
v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1728,12 +1742,13 @@ ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_writelane_b32 v40, s34, 2 @@ -1814,19 +1829,20 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 +; GISEL-NEXT: ; implicit-def: $vgpr40 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 @@ -1907,7 +1923,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; 
GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -13,17 +13,30 @@ ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_writelane_b32 v40, s16, 0 +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: v_writelane_b32 v3, s16, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s12, s33, 0x100200 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v40, 0 +; CHECK-NEXT: v_readlane_b32 s14, v3, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v40, s8, 1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v0, s8, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND @@ -56,9 +69,14 @@ ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; 
CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s4, v40, 1 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: v_readlane_b32 s4, v0, 1 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_cmp_eq_u32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3037,17 +3037,17 @@ ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s38, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s40, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s43, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s43, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s39, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff @@ -3172,13 +3172,13 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s40 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] @@ -3206,7 +3206,7 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6181,129 +6181,129 @@ ; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s35, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; 
GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6589,17 +6589,17 @@ ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 @@ -6613,12 +6613,12 @@ 
; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 @@ -6647,10 +6647,10 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[54:55], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[30:31], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 @@ -6678,8 +6678,8 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24 
@@ -6923,123 +6923,123 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s15 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s13 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s15 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s13 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s11 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s3 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s0, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s11 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s3 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s2, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s0, 16 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s9 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s9 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 
0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:144 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 -; 
GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[56:57], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -231,10 +231,10 @@ ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; W64-O0: 
buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -251,7 +251,7 @@ ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -264,16 +264,18 @@ ; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] ; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 -; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; W64-O0: buffer_load_dword +; W64-O0: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] +; W64-O0: buffer_store_dword [[VSAVEEXEC]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill ; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] @@ -290,7 +292,9 @@ ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] +; W64-O0: buffer_store_dword ; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload +; W64-O0: buffer_load_dword ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill @@ -298,8 +302,9 @@ ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] -; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] +; W64-O0: buffer_load_dword [[VSAVEEXEC1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload +; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC1]], [[SAVEEXEC_IDX0]] +; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC1]], [[SAVEEXEC_IDX1]] ; W64-O0: s_mov_b64 exec, 
s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -191,48 +191,49 @@ ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 ; GFX9-NEXT: v_writelane_b32 v44, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s36, 3 +; GFX9-NEXT: v_writelane_b32 v42, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s37, 4 +; GFX9-NEXT: v_writelane_b32 v42, s37, 4 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: v_mov_b32_e32 
v42, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 ; GFX9-NEXT: s_mov_b32 s34, s15 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v43 ; GFX9-NEXT: s_mov_b32 s15, s34 -; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v43 ; GFX9-NEXT: s_mov_b32 s15, s34 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s37, v40, 4 -; GFX9-NEXT: v_readlane_b32 s36, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s37, v42, 4 +; GFX9-NEXT: v_readlane_b32 s36, v42, 3 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 ; GFX9-NEXT: v_readlane_b32 s4, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: 
s_addk_i32 s32, 0xf800 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,15 +27,19 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, s33 +; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v1, s30, 0 -; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[12:13], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[12:13] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 @@ -44,17 +48,21 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_or_saveexec_b64 s[12:13], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[12:13] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 -; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s31, v0, 1 +; CHECK-NEXT: v_readlane_b32 s30, v0, 0 ; 
CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s6 +; CHECK-NEXT: s_mov_b32 s33, s14 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -91,21 +99,22 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v1, s33, 0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s33, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 -; CHECK-NEXT: v_readlane_b32 s33, v1, 0 +; CHECK-NEXT: v_readlane_b32 s33, v0, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_xor_saveexec_b64 s[8:9], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[8:9] ; CHECK-NEXT: s_setpc_b64 s[4:5] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -152,12 +161,13 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s6, s33 +; CHECK-NEXT: s_mov_b32 s12, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 +; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 ; CHECK-NEXT: v_writelane_b32 v1, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[4:5] @@ -174,7 +184,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s6 +; CHECK-NEXT: s_mov_b32 s33, s12 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -186,14 +196,19 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s7, s33 +; CHECK-NEXT: s_mov_b32 s13, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v2, s30, 0 -; CHECK-NEXT: v_writelane_b32 v2, s31, 1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[14:15], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[14:15] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12 @@ -202,13 
+217,18 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: v_readlane_b32 s31, v2, 1 -; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: s_or_saveexec_b64 s[14:15], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[14:15] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 +; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s7 +; CHECK-NEXT: s_mov_b32 s33, s13 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -15,14 +15,21 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 -; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v41, s16, 0 +; CHECK-NEXT: v_writelane_b32 
v40, s34, 0 +; CHECK-NEXT: v_writelane_b32 v40, s35, 1 +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: .Ltmp0: ; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 ; CHECK-NEXT: s_getpc_b64 s[16:17] @@ -36,13 +43,20 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v41, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s31, v0, 1 +; CHECK-NEXT: v_readlane_b32 s30, v0, 0 +; CHECK-NEXT: v_readlane_b32 s34, v40, 0 +; CHECK-NEXT: v_readlane_b32 s35, v40, 1 +; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll 
b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,10 +11,17 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s94, -1 +; GCN-NEXT: s_mov_b32 s95, 0xe8f000 +; GCN-NEXT: s_add_u32 s92, s92, s3 +; GCN-NEXT: s_addc_u32 s93, s93, 0 ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -100,264 +107,285 @@ ; GCN-NEXT: v_writelane_b32 v0, s9, 61 ; GCN-NEXT: v_writelane_b32 v0, s10, 62 ; GCN-NEXT: v_writelane_b32 v0, s11, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: 
v_writelane_b32 v1, s4, 8 -; GCN-NEXT: v_writelane_b32 v1, s5, 9 -; GCN-NEXT: v_writelane_b32 v1, s6, 10 -; GCN-NEXT: v_writelane_b32 v1, s7, 11 -; GCN-NEXT: v_writelane_b32 v1, s8, 12 -; GCN-NEXT: v_writelane_b32 v1, s9, 13 -; GCN-NEXT: v_writelane_b32 v1, s10, 14 -; GCN-NEXT: v_writelane_b32 v1, s11, 15 +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 24 -; GCN-NEXT: v_writelane_b32 v1, s5, 25 -; GCN-NEXT: v_writelane_b32 v1, s6, 26 -; GCN-NEXT: v_writelane_b32 v1, s7, 27 -; GCN-NEXT: v_writelane_b32 v1, s8, 28 -; GCN-NEXT: v_writelane_b32 v1, s9, 29 -; GCN-NEXT: v_writelane_b32 v1, s10, 30 -; GCN-NEXT: v_writelane_b32 v1, s11, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: 
v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 40 -; GCN-NEXT: v_writelane_b32 v1, s5, 41 -; GCN-NEXT: v_writelane_b32 v1, s6, 42 -; GCN-NEXT: v_writelane_b32 v1, s7, 43 -; GCN-NEXT: v_writelane_b32 v1, s8, 44 -; GCN-NEXT: v_writelane_b32 v1, s9, 45 -; GCN-NEXT: v_writelane_b32 v1, s10, 46 -; GCN-NEXT: v_writelane_b32 v1, s11, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: 
v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 56 -; GCN-NEXT: v_writelane_b32 v1, s5, 57 -; GCN-NEXT: v_writelane_b32 v1, s6, 58 -; GCN-NEXT: v_writelane_b32 v1, s7, 59 -; GCN-NEXT: v_writelane_b32 v1, s8, 60 -; GCN-NEXT: v_writelane_b32 v1, s9, 61 -; GCN-NEXT: v_writelane_b32 v1, s10, 62 -; GCN-NEXT: v_writelane_b32 v1, s11, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 56 +; GCN-NEXT: v_writelane_b32 v0, s5, 57 +; GCN-NEXT: v_writelane_b32 v0, s6, 58 +; GCN-NEXT: v_writelane_b32 v0, s7, 59 +; GCN-NEXT: v_writelane_b32 v0, s8, 60 +; GCN-NEXT: v_writelane_b32 v0, s9, 61 +; GCN-NEXT: v_writelane_b32 v0, s10, 62 +; GCN-NEXT: v_writelane_b32 v0, s11, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 0 -; GCN-NEXT: v_writelane_b32 v2, s5, 1 -; GCN-NEXT: v_writelane_b32 v2, s6, 2 -; GCN-NEXT: v_writelane_b32 v2, s7, 3 -; GCN-NEXT: v_writelane_b32 v2, s8, 4 -; GCN-NEXT: v_writelane_b32 v2, s9, 5 -; GCN-NEXT: v_writelane_b32 v2, s10, 6 -; GCN-NEXT: v_writelane_b32 v2, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 
exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s8, v1, 56 -; GCN-NEXT: v_readlane_b32 s9, v1, 57 -; GCN-NEXT: v_readlane_b32 s10, v1, 58 -; GCN-NEXT: v_readlane_b32 s11, v1, 59 -; GCN-NEXT: v_readlane_b32 s12, v1, 60 -; GCN-NEXT: v_readlane_b32 s13, v1, 61 -; GCN-NEXT: v_readlane_b32 s14, v1, 62 -; GCN-NEXT: v_readlane_b32 s15, v1, 63 -; GCN-NEXT: v_readlane_b32 s16, v1, 48 -; GCN-NEXT: v_readlane_b32 s17, v1, 49 -; GCN-NEXT: v_readlane_b32 s18, v1, 50 -; GCN-NEXT: v_readlane_b32 s19, v1, 51 -; GCN-NEXT: v_readlane_b32 s20, v1, 52 -; GCN-NEXT: v_readlane_b32 s21, v1, 53 -; GCN-NEXT: v_readlane_b32 s22, v1, 54 -; GCN-NEXT: v_readlane_b32 s23, v1, 55 -; GCN-NEXT: v_readlane_b32 s24, v1, 40 -; GCN-NEXT: v_readlane_b32 s25, v1, 41 -; GCN-NEXT: v_readlane_b32 s26, v1, 42 -; GCN-NEXT: v_readlane_b32 s27, v1, 43 -; GCN-NEXT: v_readlane_b32 s28, v1, 44 -; GCN-NEXT: v_readlane_b32 s29, v1, 45 -; GCN-NEXT: v_readlane_b32 s30, v1, 46 -; GCN-NEXT: v_readlane_b32 s31, v1, 47 -; GCN-NEXT: v_readlane_b32 s36, v1, 32 -; GCN-NEXT: v_readlane_b32 s37, v1, 33 -; GCN-NEXT: v_readlane_b32 s38, v1, 34 -; GCN-NEXT: v_readlane_b32 s39, v1, 35 -; GCN-NEXT: v_readlane_b32 s40, v1, 36 -; GCN-NEXT: v_readlane_b32 s41, v1, 37 -; GCN-NEXT: v_readlane_b32 s42, v1, 38 -; GCN-NEXT: v_readlane_b32 s43, v1, 39 -; GCN-NEXT: v_readlane_b32 s44, v1, 24 -; GCN-NEXT: v_readlane_b32 s45, v1, 25 -; GCN-NEXT: v_readlane_b32 s46, v1, 26 -; GCN-NEXT: v_readlane_b32 s47, v1, 27 -; GCN-NEXT: v_readlane_b32 s48, v1, 28 -; GCN-NEXT: v_readlane_b32 s49, v1, 29 -; GCN-NEXT: v_readlane_b32 s50, v1, 30 -; GCN-NEXT: v_readlane_b32 s51, v1, 31 -; GCN-NEXT: v_readlane_b32 s52, v1, 16 -; GCN-NEXT: v_readlane_b32 s53, v1, 17 -; GCN-NEXT: v_readlane_b32 s54, v1, 18 -; GCN-NEXT: v_readlane_b32 s55, v1, 19 -; GCN-NEXT: v_readlane_b32 s56, v1, 20 -; GCN-NEXT: v_readlane_b32 
s57, v1, 21 -; GCN-NEXT: v_readlane_b32 s58, v1, 22 -; GCN-NEXT: v_readlane_b32 s59, v1, 23 -; GCN-NEXT: v_readlane_b32 s60, v1, 8 -; GCN-NEXT: v_readlane_b32 s61, v1, 9 -; GCN-NEXT: v_readlane_b32 s62, v1, 10 -; GCN-NEXT: v_readlane_b32 s63, v1, 11 -; GCN-NEXT: v_readlane_b32 s64, v1, 12 -; GCN-NEXT: v_readlane_b32 s65, v1, 13 -; GCN-NEXT: v_readlane_b32 s66, v1, 14 -; GCN-NEXT: v_readlane_b32 s67, v1, 15 -; GCN-NEXT: v_readlane_b32 s68, v1, 0 -; GCN-NEXT: v_readlane_b32 s69, v1, 1 -; GCN-NEXT: v_readlane_b32 s70, v1, 2 -; GCN-NEXT: v_readlane_b32 s71, v1, 3 -; GCN-NEXT: v_readlane_b32 s72, v1, 4 -; GCN-NEXT: v_readlane_b32 s73, v1, 5 -; GCN-NEXT: v_readlane_b32 s74, v1, 6 -; GCN-NEXT: v_readlane_b32 s75, v1, 7 -; GCN-NEXT: v_readlane_b32 s76, v0, 56 -; GCN-NEXT: v_readlane_b32 s77, v0, 57 -; GCN-NEXT: v_readlane_b32 s78, v0, 58 -; GCN-NEXT: v_readlane_b32 s79, v0, 59 -; GCN-NEXT: v_readlane_b32 s80, v0, 60 -; GCN-NEXT: v_readlane_b32 s81, v0, 61 -; GCN-NEXT: v_readlane_b32 s82, v0, 62 -; GCN-NEXT: v_readlane_b32 s83, v0, 63 -; GCN-NEXT: v_readlane_b32 s84, v0, 48 -; GCN-NEXT: v_readlane_b32 s85, v0, 49 -; GCN-NEXT: v_readlane_b32 s86, v0, 50 -; GCN-NEXT: v_readlane_b32 s87, v0, 51 -; GCN-NEXT: v_readlane_b32 s88, v0, 52 -; GCN-NEXT: v_readlane_b32 s89, v0, 53 -; GCN-NEXT: v_readlane_b32 s90, v0, 54 -; GCN-NEXT: v_readlane_b32 s91, v0, 55 -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] 
+; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s8, v2, 56 +; GCN-NEXT: v_readlane_b32 s9, v2, 57 +; GCN-NEXT: v_readlane_b32 s10, v2, 58 +; GCN-NEXT: v_readlane_b32 s11, v2, 59 +; GCN-NEXT: v_readlane_b32 s12, v2, 60 +; GCN-NEXT: v_readlane_b32 s13, v2, 61 +; GCN-NEXT: v_readlane_b32 s14, v2, 62 +; GCN-NEXT: v_readlane_b32 s15, v2, 63 +; GCN-NEXT: v_readlane_b32 s16, v2, 48 +; GCN-NEXT: v_readlane_b32 s17, v2, 49 +; GCN-NEXT: v_readlane_b32 s18, v2, 50 +; GCN-NEXT: v_readlane_b32 s19, v2, 51 +; GCN-NEXT: v_readlane_b32 s20, v2, 52 +; GCN-NEXT: v_readlane_b32 s21, v2, 53 +; GCN-NEXT: v_readlane_b32 s22, v2, 54 +; GCN-NEXT: v_readlane_b32 s23, v2, 55 +; GCN-NEXT: v_readlane_b32 s24, v2, 40 +; GCN-NEXT: v_readlane_b32 s25, v2, 41 +; GCN-NEXT: v_readlane_b32 s26, v2, 42 +; GCN-NEXT: v_readlane_b32 s27, v2, 43 +; GCN-NEXT: v_readlane_b32 s28, v2, 44 +; GCN-NEXT: v_readlane_b32 s29, v2, 45 +; GCN-NEXT: v_readlane_b32 s30, v2, 46 +; GCN-NEXT: v_readlane_b32 s31, v2, 47 +; GCN-NEXT: v_readlane_b32 s36, v2, 32 +; GCN-NEXT: v_readlane_b32 s37, v2, 33 +; GCN-NEXT: v_readlane_b32 s38, v2, 34 +; GCN-NEXT: v_readlane_b32 s39, v2, 35 +; GCN-NEXT: v_readlane_b32 s40, v2, 36 +; GCN-NEXT: v_readlane_b32 s41, v2, 37 +; GCN-NEXT: v_readlane_b32 s42, v2, 38 +; GCN-NEXT: v_readlane_b32 s43, v2, 39 +; GCN-NEXT: v_readlane_b32 s44, v2, 24 +; GCN-NEXT: v_readlane_b32 s45, v2, 25 +; GCN-NEXT: v_readlane_b32 s46, v2, 26 +; GCN-NEXT: v_readlane_b32 s47, v2, 27 +; GCN-NEXT: v_readlane_b32 s48, v2, 28 +; GCN-NEXT: v_readlane_b32 s49, v2, 29 +; GCN-NEXT: v_readlane_b32 s50, v2, 30 +; GCN-NEXT: v_readlane_b32 s51, v2, 31 +; GCN-NEXT: v_readlane_b32 s52, v2, 16 +; GCN-NEXT: v_readlane_b32 s53, v2, 17 +; GCN-NEXT: v_readlane_b32 s54, v2, 18 +; GCN-NEXT: v_readlane_b32 s55, v2, 19 +; GCN-NEXT: v_readlane_b32 s56, v2, 20 +; 
GCN-NEXT: v_readlane_b32 s57, v2, 21 +; GCN-NEXT: v_readlane_b32 s58, v2, 22 +; GCN-NEXT: v_readlane_b32 s59, v2, 23 +; GCN-NEXT: v_readlane_b32 s60, v2, 8 +; GCN-NEXT: v_readlane_b32 s61, v2, 9 +; GCN-NEXT: v_readlane_b32 s62, v2, 10 +; GCN-NEXT: v_readlane_b32 s63, v2, 11 +; GCN-NEXT: v_readlane_b32 s64, v2, 12 +; GCN-NEXT: v_readlane_b32 s65, v2, 13 +; GCN-NEXT: v_readlane_b32 s66, v2, 14 +; GCN-NEXT: v_readlane_b32 s67, v2, 15 +; GCN-NEXT: v_readlane_b32 s68, v2, 0 +; GCN-NEXT: v_readlane_b32 s69, v2, 1 +; GCN-NEXT: v_readlane_b32 s70, v2, 2 +; GCN-NEXT: v_readlane_b32 s71, v2, 3 +; GCN-NEXT: v_readlane_b32 s72, v2, 4 +; GCN-NEXT: v_readlane_b32 s73, v2, 5 +; GCN-NEXT: v_readlane_b32 s74, v2, 6 +; GCN-NEXT: v_readlane_b32 s75, v2, 7 +; GCN-NEXT: v_readlane_b32 s76, v1, 56 +; GCN-NEXT: v_readlane_b32 s77, v1, 57 +; GCN-NEXT: v_readlane_b32 s78, v1, 58 +; GCN-NEXT: v_readlane_b32 s79, v1, 59 +; GCN-NEXT: v_readlane_b32 s80, v1, 60 +; GCN-NEXT: v_readlane_b32 s81, v1, 61 +; GCN-NEXT: v_readlane_b32 s82, v1, 62 +; GCN-NEXT: v_readlane_b32 s83, v1, 63 +; GCN-NEXT: v_readlane_b32 s84, v1, 48 +; GCN-NEXT: v_readlane_b32 s85, v1, 49 +; GCN-NEXT: v_readlane_b32 s86, v1, 50 +; GCN-NEXT: v_readlane_b32 s87, v1, 51 +; GCN-NEXT: v_readlane_b32 s88, v1, 52 +; GCN-NEXT: v_readlane_b32 s89, v1, 53 +; GCN-NEXT: v_readlane_b32 s90, v1, 54 +; GCN-NEXT: v_readlane_b32 s91, v1, 55 +; GCN-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: v_readlane_b32 s2, v1, 2 +; GCN-NEXT: v_readlane_b32 s3, v1, 3 +; GCN-NEXT: v_readlane_b32 s4, v1, 4 +; GCN-NEXT: v_readlane_b32 s5, v1, 5 +; GCN-NEXT: v_readlane_b32 s6, v1, 6 +; GCN-NEXT: v_readlane_b32 s7, v1, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 8 -; GCN-NEXT: v_readlane_b32 s1, v0, 9 -; GCN-NEXT: v_readlane_b32 s2, v0, 10 -; GCN-NEXT: v_readlane_b32 s3, v0, 11 -; GCN-NEXT: v_readlane_b32 s4, v0, 12 -; GCN-NEXT: v_readlane_b32 s5, v0, 13 -; 
GCN-NEXT: v_readlane_b32 s6, v0, 14 -; GCN-NEXT: v_readlane_b32 s7, v0, 15 +; GCN-NEXT: v_readlane_b32 s0, v1, 8 +; GCN-NEXT: v_readlane_b32 s1, v1, 9 +; GCN-NEXT: v_readlane_b32 s2, v1, 10 +; GCN-NEXT: v_readlane_b32 s3, v1, 11 +; GCN-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-NEXT: v_readlane_b32 s6, v1, 14 +; GCN-NEXT: v_readlane_b32 s7, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 16 -; GCN-NEXT: v_readlane_b32 s1, v0, 17 -; GCN-NEXT: v_readlane_b32 s2, v0, 18 -; GCN-NEXT: v_readlane_b32 s3, v0, 19 -; GCN-NEXT: v_readlane_b32 s4, v0, 20 -; GCN-NEXT: v_readlane_b32 s5, v0, 21 -; GCN-NEXT: v_readlane_b32 s6, v0, 22 -; GCN-NEXT: v_readlane_b32 s7, v0, 23 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 24 -; GCN-NEXT: v_readlane_b32 s1, v0, 25 -; GCN-NEXT: v_readlane_b32 s2, v0, 26 -; GCN-NEXT: v_readlane_b32 s3, v0, 27 -; GCN-NEXT: v_readlane_b32 s4, v0, 28 -; GCN-NEXT: v_readlane_b32 s5, v0, 29 -; GCN-NEXT: v_readlane_b32 s6, v0, 30 -; GCN-NEXT: v_readlane_b32 s7, v0, 31 +; GCN-NEXT: v_readlane_b32 s0, v1, 24 +; GCN-NEXT: v_readlane_b32 s1, v1, 25 +; GCN-NEXT: v_readlane_b32 s2, v1, 26 +; GCN-NEXT: v_readlane_b32 s3, v1, 27 +; GCN-NEXT: v_readlane_b32 s4, v1, 28 +; GCN-NEXT: v_readlane_b32 s5, v1, 29 +; GCN-NEXT: v_readlane_b32 s6, v1, 30 +; GCN-NEXT: v_readlane_b32 s7, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 32 -; GCN-NEXT: v_readlane_b32 s1, v0, 33 -; GCN-NEXT: v_readlane_b32 s2, v0, 34 -; GCN-NEXT: v_readlane_b32 s3, v0, 
35 -; GCN-NEXT: v_readlane_b32 s4, v0, 36 -; GCN-NEXT: v_readlane_b32 s5, v0, 37 -; GCN-NEXT: v_readlane_b32 s6, v0, 38 -; GCN-NEXT: v_readlane_b32 s7, v0, 39 +; GCN-NEXT: v_readlane_b32 s0, v1, 32 +; GCN-NEXT: v_readlane_b32 s1, v1, 33 +; GCN-NEXT: v_readlane_b32 s2, v1, 34 +; GCN-NEXT: v_readlane_b32 s3, v1, 35 +; GCN-NEXT: v_readlane_b32 s4, v1, 36 +; GCN-NEXT: v_readlane_b32 s5, v1, 37 +; GCN-NEXT: v_readlane_b32 s6, v1, 38 +; GCN-NEXT: v_readlane_b32 s7, v1, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 40 -; GCN-NEXT: v_readlane_b32 s1, v0, 41 -; GCN-NEXT: v_readlane_b32 s2, v0, 42 -; GCN-NEXT: v_readlane_b32 s3, v0, 43 -; GCN-NEXT: v_readlane_b32 s4, v0, 44 -; GCN-NEXT: v_readlane_b32 s5, v0, 45 -; GCN-NEXT: v_readlane_b32 s6, v0, 46 -; GCN-NEXT: v_readlane_b32 s7, v0, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 40 +; GCN-NEXT: v_readlane_b32 s1, v1, 41 +; GCN-NEXT: v_readlane_b32 s2, v1, 42 +; GCN-NEXT: v_readlane_b32 s3, v1, 43 +; GCN-NEXT: v_readlane_b32 s4, v1, 44 +; GCN-NEXT: v_readlane_b32 s5, v1, 45 +; GCN-NEXT: v_readlane_b32 s6, v1, 46 +; GCN-NEXT: v_readlane_b32 s7, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v2, 0 -; GCN-NEXT: v_readlane_b32 s1, v2, 1 -; GCN-NEXT: v_readlane_b32 s2, v2, 2 -; GCN-NEXT: v_readlane_b32 s3, v2, 3 -; GCN-NEXT: v_readlane_b32 s4, v2, 4 -; GCN-NEXT: v_readlane_b32 s5, v2, 5 -; GCN-NEXT: v_readlane_b32 s6, v2, 6 -; GCN-NEXT: v_readlane_b32 s7, v2, 7 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[84:91] ; GCN-NEXT: ;;#ASMEND @@ -442,10 +470,17 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 
addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s54, -1 +; GCN-NEXT: s_mov_b32 s55, 0xe8f000 +; GCN-NEXT: s_add_u32 s52, s52, s3 +; GCN-NEXT: s_addc_u32 s53, s53, 0 ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 ; GCN-NEXT: v_writelane_b32 v0, s6, 2 @@ -519,27 +554,41 @@ ; GCN-NEXT: v_writelane_b32 v0, s17, 61 ; GCN-NEXT: v_writelane_b32 v0, s18, 62 ; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[28:29] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s2, 8 -; GCN-NEXT: v_writelane_b32 v1, s3, 9 +; GCN-NEXT: v_writelane_b32 v0, s2, 8 +; GCN-NEXT: v_writelane_b32 v0, s3, 9 +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: 
s_mov_b64 exec, s[28:29] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 ; GCN-NEXT: v_readlane_b32 s17, v1, 9 ; GCN-NEXT: v_readlane_b32 s20, v1, 0 @@ -685,176 +734,176 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 0 -; GCN-NEXT: v_writelane_b32 v31, s5, 1 -; GCN-NEXT: v_writelane_b32 v31, s6, 2 -; GCN-NEXT: v_writelane_b32 v31, s7, 3 -; GCN-NEXT: v_writelane_b32 v31, s8, 4 -; GCN-NEXT: v_writelane_b32 v31, s9, 5 -; GCN-NEXT: v_writelane_b32 v31, s10, 6 -; GCN-NEXT: v_writelane_b32 v31, s11, 7 -; GCN-NEXT: v_writelane_b32 v31, s12, 8 -; GCN-NEXT: v_writelane_b32 v31, s13, 9 -; GCN-NEXT: v_writelane_b32 v31, s14, 10 -; GCN-NEXT: v_writelane_b32 v31, s15, 11 -; GCN-NEXT: v_writelane_b32 v31, s16, 12 -; GCN-NEXT: v_writelane_b32 v31, s17, 13 -; GCN-NEXT: v_writelane_b32 v31, s18, 14 -; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, 
s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 16 -; GCN-NEXT: v_writelane_b32 v31, s5, 17 -; GCN-NEXT: v_writelane_b32 v31, s6, 18 -; GCN-NEXT: v_writelane_b32 v31, s7, 19 -; GCN-NEXT: v_writelane_b32 v31, s8, 20 -; GCN-NEXT: v_writelane_b32 v31, s9, 21 -; GCN-NEXT: v_writelane_b32 v31, s10, 22 -; GCN-NEXT: v_writelane_b32 v31, s11, 23 -; GCN-NEXT: v_writelane_b32 v31, s12, 24 -; GCN-NEXT: v_writelane_b32 v31, s13, 25 -; GCN-NEXT: v_writelane_b32 v31, s14, 26 -; GCN-NEXT: v_writelane_b32 v31, s15, 27 -; GCN-NEXT: v_writelane_b32 v31, s16, 28 -; GCN-NEXT: v_writelane_b32 v31, s17, 29 -; GCN-NEXT: v_writelane_b32 v31, s18, 30 -; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 32 -; GCN-NEXT: v_writelane_b32 v31, s5, 33 -; GCN-NEXT: v_writelane_b32 v31, s6, 34 -; GCN-NEXT: v_writelane_b32 v31, s7, 35 -; GCN-NEXT: v_writelane_b32 v31, s8, 36 -; GCN-NEXT: v_writelane_b32 v31, s9, 37 -; GCN-NEXT: v_writelane_b32 v31, s10, 38 -; GCN-NEXT: v_writelane_b32 v31, s11, 39 -; GCN-NEXT: v_writelane_b32 v31, s12, 40 -; GCN-NEXT: v_writelane_b32 v31, s13, 41 -; GCN-NEXT: v_writelane_b32 v31, 
s14, 42 -; GCN-NEXT: v_writelane_b32 v31, s15, 43 -; GCN-NEXT: v_writelane_b32 v31, s16, 44 -; GCN-NEXT: v_writelane_b32 v31, s17, 45 -; GCN-NEXT: v_writelane_b32 v31, s18, 46 -; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 48 -; GCN-NEXT: v_writelane_b32 v31, s5, 49 -; GCN-NEXT: v_writelane_b32 v31, s6, 50 -; GCN-NEXT: v_writelane_b32 v31, s7, 51 -; GCN-NEXT: v_writelane_b32 v31, s8, 52 -; GCN-NEXT: v_writelane_b32 v31, s9, 53 -; GCN-NEXT: v_writelane_b32 v31, s10, 54 -; GCN-NEXT: v_writelane_b32 v31, s11, 55 -; GCN-NEXT: v_writelane_b32 v31, s12, 56 -; GCN-NEXT: v_writelane_b32 v31, s13, 57 -; GCN-NEXT: v_writelane_b32 v31, s14, 58 -; GCN-NEXT: v_writelane_b32 v31, s15, 59 -; GCN-NEXT: v_writelane_b32 v31, s16, 60 -; GCN-NEXT: v_writelane_b32 v31, s17, 61 -; GCN-NEXT: v_writelane_b32 v31, s18, 62 -; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 
v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s36, v31, 32 -; GCN-NEXT: v_readlane_b32 s37, v31, 33 -; GCN-NEXT: v_readlane_b32 s38, v31, 34 -; GCN-NEXT: v_readlane_b32 s39, v31, 35 -; GCN-NEXT: v_readlane_b32 s40, v31, 36 -; GCN-NEXT: v_readlane_b32 s41, v31, 37 -; GCN-NEXT: v_readlane_b32 s42, v31, 38 -; GCN-NEXT: v_readlane_b32 s43, v31, 39 -; GCN-NEXT: v_readlane_b32 s44, v31, 40 -; GCN-NEXT: v_readlane_b32 s45, v31, 41 -; GCN-NEXT: v_readlane_b32 s46, v31, 42 -; GCN-NEXT: v_readlane_b32 s47, v31, 43 -; GCN-NEXT: v_readlane_b32 s48, v31, 44 -; GCN-NEXT: v_readlane_b32 s49, v31, 45 -; GCN-NEXT: v_readlane_b32 s50, v31, 46 -; GCN-NEXT: v_readlane_b32 s51, v31, 47 -; GCN-NEXT: v_readlane_b32 s0, v31, 16 -; GCN-NEXT: v_readlane_b32 s1, v31, 17 -; GCN-NEXT: v_readlane_b32 s2, v31, 18 -; GCN-NEXT: v_readlane_b32 s3, v31, 19 -; GCN-NEXT: 
v_readlane_b32 s4, v31, 20 -; GCN-NEXT: v_readlane_b32 s5, v31, 21 -; GCN-NEXT: v_readlane_b32 s6, v31, 22 -; GCN-NEXT: v_readlane_b32 s7, v31, 23 -; GCN-NEXT: v_readlane_b32 s8, v31, 24 -; GCN-NEXT: v_readlane_b32 s9, v31, 25 -; GCN-NEXT: v_readlane_b32 s10, v31, 26 -; GCN-NEXT: v_readlane_b32 s11, v31, 27 -; GCN-NEXT: v_readlane_b32 s12, v31, 28 -; GCN-NEXT: v_readlane_b32 s13, v31, 29 -; GCN-NEXT: v_readlane_b32 s14, v31, 30 -; GCN-NEXT: v_readlane_b32 s15, v31, 31 -; GCN-NEXT: v_readlane_b32 s16, v31, 0 -; GCN-NEXT: v_readlane_b32 s17, v31, 1 -; GCN-NEXT: v_readlane_b32 s18, v31, 2 -; GCN-NEXT: v_readlane_b32 s19, v31, 3 -; GCN-NEXT: v_readlane_b32 s20, v31, 4 -; GCN-NEXT: v_readlane_b32 s21, v31, 5 -; GCN-NEXT: v_readlane_b32 s22, v31, 6 -; GCN-NEXT: v_readlane_b32 s23, v31, 7 -; GCN-NEXT: v_readlane_b32 s24, v31, 8 -; GCN-NEXT: v_readlane_b32 s25, v31, 9 -; GCN-NEXT: v_readlane_b32 s26, v31, 10 -; GCN-NEXT: v_readlane_b32 s27, v31, 11 -; GCN-NEXT: v_readlane_b32 s28, v31, 12 -; GCN-NEXT: v_readlane_b32 s29, v31, 13 -; GCN-NEXT: v_readlane_b32 s30, v31, 14 -; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s36, v1, 32 +; GCN-NEXT: v_readlane_b32 s37, v1, 33 +; GCN-NEXT: v_readlane_b32 s38, v1, 34 +; GCN-NEXT: v_readlane_b32 s39, v1, 35 +; GCN-NEXT: v_readlane_b32 s40, v1, 36 +; GCN-NEXT: v_readlane_b32 s41, v1, 37 +; GCN-NEXT: v_readlane_b32 s42, v1, 38 +; GCN-NEXT: v_readlane_b32 s43, v1, 39 +; GCN-NEXT: v_readlane_b32 s44, v1, 40 +; GCN-NEXT: v_readlane_b32 s45, v1, 41 +; GCN-NEXT: v_readlane_b32 s46, v1, 42 +; GCN-NEXT: v_readlane_b32 s47, v1, 43 +; GCN-NEXT: v_readlane_b32 s48, v1, 
44 +; GCN-NEXT: v_readlane_b32 s49, v1, 45 +; GCN-NEXT: v_readlane_b32 s50, v1, 46 +; GCN-NEXT: v_readlane_b32 s51, v1, 47 +; GCN-NEXT: v_readlane_b32 s0, v1, 16 +; GCN-NEXT: v_readlane_b32 s1, v1, 17 +; GCN-NEXT: v_readlane_b32 s2, v1, 18 +; GCN-NEXT: v_readlane_b32 s3, v1, 19 +; GCN-NEXT: v_readlane_b32 s4, v1, 20 +; GCN-NEXT: v_readlane_b32 s5, v1, 21 +; GCN-NEXT: v_readlane_b32 s6, v1, 22 +; GCN-NEXT: v_readlane_b32 s7, v1, 23 +; GCN-NEXT: v_readlane_b32 s8, v1, 24 +; GCN-NEXT: v_readlane_b32 s9, v1, 25 +; GCN-NEXT: v_readlane_b32 s10, v1, 26 +; GCN-NEXT: v_readlane_b32 s11, v1, 27 +; GCN-NEXT: v_readlane_b32 s12, v1, 28 +; GCN-NEXT: v_readlane_b32 s13, v1, 29 +; GCN-NEXT: v_readlane_b32 s14, v1, 30 +; GCN-NEXT: v_readlane_b32 s15, v1, 31 +; GCN-NEXT: v_readlane_b32 s16, v1, 0 +; GCN-NEXT: v_readlane_b32 s17, v1, 1 +; GCN-NEXT: v_readlane_b32 s18, v1, 2 +; GCN-NEXT: v_readlane_b32 s19, v1, 3 +; GCN-NEXT: v_readlane_b32 s20, v1, 4 +; GCN-NEXT: v_readlane_b32 s21, v1, 5 +; GCN-NEXT: v_readlane_b32 s22, v1, 6 +; GCN-NEXT: v_readlane_b32 s23, v1, 7 +; GCN-NEXT: v_readlane_b32 s24, v1, 8 +; GCN-NEXT: v_readlane_b32 s25, v1, 9 +; GCN-NEXT: v_readlane_b32 s26, v1, 10 +; GCN-NEXT: v_readlane_b32 s27, v1, 11 +; GCN-NEXT: v_readlane_b32 s28, v1, 12 +; GCN-NEXT: v_readlane_b32 s29, v1, 13 +; GCN-NEXT: v_readlane_b32 s30, v1, 14 +; GCN-NEXT: v_readlane_b32 s31, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v31, 48 -; GCN-NEXT: v_readlane_b32 s5, v31, 49 -; GCN-NEXT: v_readlane_b32 s6, v31, 50 -; GCN-NEXT: v_readlane_b32 s7, v31, 51 -; GCN-NEXT: v_readlane_b32 s8, v31, 52 -; GCN-NEXT: v_readlane_b32 s9, v31, 53 -; GCN-NEXT: v_readlane_b32 s10, v31, 54 -; GCN-NEXT: v_readlane_b32 s11, v31, 55 -; GCN-NEXT: v_readlane_b32 s12, v31, 56 -; GCN-NEXT: v_readlane_b32 s13, v31, 57 -; GCN-NEXT: v_readlane_b32 s14, v31, 58 -; GCN-NEXT: 
v_readlane_b32 s15, v31, 59 -; GCN-NEXT: v_readlane_b32 s16, v31, 60 -; GCN-NEXT: v_readlane_b32 s17, v31, 61 -; GCN-NEXT: v_readlane_b32 s18, v31, 62 -; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 48 +; GCN-NEXT: v_readlane_b32 s5, v1, 49 +; GCN-NEXT: v_readlane_b32 s6, v1, 50 +; GCN-NEXT: v_readlane_b32 s7, v1, 51 +; GCN-NEXT: v_readlane_b32 s8, v1, 52 +; GCN-NEXT: v_readlane_b32 s9, v1, 53 +; GCN-NEXT: v_readlane_b32 s10, v1, 54 +; GCN-NEXT: v_readlane_b32 s11, v1, 55 +; GCN-NEXT: v_readlane_b32 s12, v1, 56 +; GCN-NEXT: v_readlane_b32 s13, v1, 57 +; GCN-NEXT: v_readlane_b32 s14, v1, 58 +; GCN-NEXT: v_readlane_b32 s15, v1, 59 +; GCN-NEXT: v_readlane_b32 s16, v1, 60 +; GCN-NEXT: v_readlane_b32 s17, v1, 61 +; GCN-NEXT: v_readlane_b32 s18, v1, 62 +; GCN-NEXT: v_readlane_b32 s19, v1, 63 ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -920,144 +969,152 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 0 -; GCN-NEXT: v_writelane_b32 v31, s5, 1 -; GCN-NEXT: v_writelane_b32 v31, s6, 2 -; GCN-NEXT: v_writelane_b32 v31, s7, 3 -; GCN-NEXT: v_writelane_b32 v31, s8, 4 -; GCN-NEXT: v_writelane_b32 v31, s9, 5 -; GCN-NEXT: v_writelane_b32 v31, s10, 6 -; GCN-NEXT: v_writelane_b32 v31, s11, 7 -; GCN-NEXT: v_writelane_b32 v31, s12, 8 -; GCN-NEXT: v_writelane_b32 v31, s13, 9 -; GCN-NEXT: v_writelane_b32 v31, s14, 10 -; GCN-NEXT: v_writelane_b32 v31, s15, 11 -; GCN-NEXT: v_writelane_b32 v31, s16, 12 -; GCN-NEXT: v_writelane_b32 v31, s17, 13 -; 
GCN-NEXT: v_writelane_b32 v31, s18, 14 -; GCN-NEXT: v_writelane_b32 v31, s19, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 16 -; GCN-NEXT: v_writelane_b32 v31, s5, 17 -; GCN-NEXT: v_writelane_b32 v31, s6, 18 -; GCN-NEXT: v_writelane_b32 v31, s7, 19 -; GCN-NEXT: v_writelane_b32 v31, s8, 20 -; GCN-NEXT: v_writelane_b32 v31, s9, 21 -; GCN-NEXT: v_writelane_b32 v31, s10, 22 -; GCN-NEXT: v_writelane_b32 v31, s11, 23 -; GCN-NEXT: v_writelane_b32 v31, s12, 24 -; GCN-NEXT: v_writelane_b32 v31, s13, 25 -; GCN-NEXT: v_writelane_b32 v31, s14, 26 -; GCN-NEXT: v_writelane_b32 v31, s15, 27 -; GCN-NEXT: v_writelane_b32 v31, s16, 28 -; GCN-NEXT: v_writelane_b32 v31, s17, 29 -; GCN-NEXT: v_writelane_b32 v31, s18, 30 -; GCN-NEXT: v_writelane_b32 v31, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: 
v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 32 -; GCN-NEXT: v_writelane_b32 v31, s5, 33 -; GCN-NEXT: v_writelane_b32 v31, s6, 34 -; GCN-NEXT: v_writelane_b32 v31, s7, 35 -; GCN-NEXT: v_writelane_b32 v31, s8, 36 -; GCN-NEXT: v_writelane_b32 v31, s9, 37 -; GCN-NEXT: v_writelane_b32 v31, s10, 38 -; GCN-NEXT: v_writelane_b32 v31, s11, 39 -; GCN-NEXT: v_writelane_b32 v31, s12, 40 -; GCN-NEXT: v_writelane_b32 v31, s13, 41 -; GCN-NEXT: v_writelane_b32 v31, s14, 42 -; GCN-NEXT: v_writelane_b32 v31, s15, 43 -; GCN-NEXT: v_writelane_b32 v31, s16, 44 -; GCN-NEXT: v_writelane_b32 v31, s17, 45 -; GCN-NEXT: v_writelane_b32 v31, s18, 46 -; GCN-NEXT: v_writelane_b32 v31, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v31, s4, 48 -; GCN-NEXT: v_writelane_b32 v31, s5, 49 -; GCN-NEXT: v_writelane_b32 v31, s6, 50 -; GCN-NEXT: v_writelane_b32 v31, s7, 51 -; GCN-NEXT: v_writelane_b32 v31, s8, 52 -; GCN-NEXT: v_writelane_b32 v31, s9, 53 -; GCN-NEXT: v_writelane_b32 v31, s10, 54 -; GCN-NEXT: v_writelane_b32 v31, s11, 55 -; GCN-NEXT: v_writelane_b32 v31, s12, 56 -; GCN-NEXT: 
v_writelane_b32 v31, s13, 57 -; GCN-NEXT: v_writelane_b32 v31, s14, 58 -; GCN-NEXT: v_writelane_b32 v31, s15, 59 -; GCN-NEXT: v_writelane_b32 v31, s16, 60 -; GCN-NEXT: v_writelane_b32 v31, s17, 61 -; GCN-NEXT: v_writelane_b32 v31, s18, 62 -; GCN-NEXT: v_writelane_b32 v31, s19, 63 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s36, v31, 32 -; GCN-NEXT: v_readlane_b32 s37, v31, 33 -; GCN-NEXT: v_readlane_b32 s38, v31, 34 -; GCN-NEXT: v_readlane_b32 s39, 
v31, 35 -; GCN-NEXT: v_readlane_b32 s40, v31, 36 -; GCN-NEXT: v_readlane_b32 s41, v31, 37 -; GCN-NEXT: v_readlane_b32 s42, v31, 38 -; GCN-NEXT: v_readlane_b32 s43, v31, 39 -; GCN-NEXT: v_readlane_b32 s44, v31, 40 -; GCN-NEXT: v_readlane_b32 s45, v31, 41 -; GCN-NEXT: v_readlane_b32 s46, v31, 42 -; GCN-NEXT: v_readlane_b32 s47, v31, 43 -; GCN-NEXT: v_readlane_b32 s48, v31, 44 -; GCN-NEXT: v_readlane_b32 s49, v31, 45 -; GCN-NEXT: v_readlane_b32 s50, v31, 46 -; GCN-NEXT: v_readlane_b32 s51, v31, 47 -; GCN-NEXT: v_readlane_b32 s0, v31, 16 -; GCN-NEXT: v_readlane_b32 s1, v31, 17 -; GCN-NEXT: v_readlane_b32 s2, v31, 18 -; GCN-NEXT: v_readlane_b32 s3, v31, 19 -; GCN-NEXT: v_readlane_b32 s4, v31, 20 -; GCN-NEXT: v_readlane_b32 s5, v31, 21 -; GCN-NEXT: v_readlane_b32 s6, v31, 22 -; GCN-NEXT: v_readlane_b32 s7, v31, 23 -; GCN-NEXT: v_readlane_b32 s8, v31, 24 -; GCN-NEXT: v_readlane_b32 s9, v31, 25 -; GCN-NEXT: v_readlane_b32 s10, v31, 26 -; GCN-NEXT: v_readlane_b32 s11, v31, 27 -; GCN-NEXT: v_readlane_b32 s12, v31, 28 -; GCN-NEXT: v_readlane_b32 s13, v31, 29 -; GCN-NEXT: v_readlane_b32 s14, v31, 30 -; GCN-NEXT: v_readlane_b32 s15, v31, 31 -; GCN-NEXT: v_readlane_b32 s16, v31, 0 -; GCN-NEXT: v_readlane_b32 s17, v31, 1 -; GCN-NEXT: v_readlane_b32 s18, v31, 2 -; GCN-NEXT: v_readlane_b32 s19, v31, 3 -; GCN-NEXT: v_readlane_b32 s20, v31, 4 -; GCN-NEXT: v_readlane_b32 s21, v31, 5 -; GCN-NEXT: v_readlane_b32 s22, v31, 6 -; GCN-NEXT: v_readlane_b32 s23, v31, 7 -; GCN-NEXT: v_readlane_b32 s24, v31, 8 -; GCN-NEXT: v_readlane_b32 s25, v31, 9 -; GCN-NEXT: v_readlane_b32 s26, v31, 10 -; GCN-NEXT: v_readlane_b32 s27, v31, 11 -; GCN-NEXT: v_readlane_b32 s28, v31, 12 -; GCN-NEXT: v_readlane_b32 s29, v31, 13 -; GCN-NEXT: v_readlane_b32 s30, v31, 14 -; GCN-NEXT: v_readlane_b32 s31, v31, 15 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], 
-1 +; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s36, v2, 32 +; GCN-NEXT: v_readlane_b32 s37, v2, 33 +; GCN-NEXT: v_readlane_b32 s38, v2, 34 +; GCN-NEXT: v_readlane_b32 s39, v2, 35 +; GCN-NEXT: v_readlane_b32 s40, v2, 36 +; GCN-NEXT: v_readlane_b32 s41, v2, 37 +; GCN-NEXT: v_readlane_b32 s42, v2, 38 +; GCN-NEXT: v_readlane_b32 s43, v2, 39 +; GCN-NEXT: v_readlane_b32 s44, v2, 40 +; GCN-NEXT: v_readlane_b32 s45, v2, 41 +; GCN-NEXT: v_readlane_b32 s46, v2, 42 +; GCN-NEXT: v_readlane_b32 s47, v2, 43 +; GCN-NEXT: v_readlane_b32 s48, v2, 44 +; GCN-NEXT: v_readlane_b32 s49, v2, 45 +; GCN-NEXT: v_readlane_b32 s50, v2, 46 +; GCN-NEXT: v_readlane_b32 s51, v2, 47 +; GCN-NEXT: v_readlane_b32 s0, v2, 16 +; GCN-NEXT: v_readlane_b32 s1, v2, 17 +; GCN-NEXT: v_readlane_b32 s2, v2, 18 +; GCN-NEXT: v_readlane_b32 s3, v2, 19 +; GCN-NEXT: v_readlane_b32 s4, v2, 20 +; GCN-NEXT: v_readlane_b32 s5, v2, 21 +; GCN-NEXT: v_readlane_b32 s6, v2, 22 +; GCN-NEXT: v_readlane_b32 s7, v2, 23 +; GCN-NEXT: v_readlane_b32 s8, v2, 24 +; GCN-NEXT: v_readlane_b32 s9, v2, 25 +; GCN-NEXT: v_readlane_b32 s10, v2, 26 +; GCN-NEXT: v_readlane_b32 s11, v2, 27 +; GCN-NEXT: v_readlane_b32 s12, v2, 28 +; GCN-NEXT: v_readlane_b32 s13, v2, 29 +; GCN-NEXT: v_readlane_b32 s14, v2, 30 +; GCN-NEXT: v_readlane_b32 s15, v2, 31 +; GCN-NEXT: v_readlane_b32 s16, v2, 0 +; GCN-NEXT: v_readlane_b32 s17, v2, 1 +; GCN-NEXT: v_readlane_b32 s18, v2, 2 +; GCN-NEXT: v_readlane_b32 s19, v2, 3 +; GCN-NEXT: v_readlane_b32 s20, v2, 4 +; GCN-NEXT: v_readlane_b32 s21, v2, 5 +; GCN-NEXT: v_readlane_b32 s22, v2, 6 +; GCN-NEXT: v_readlane_b32 s23, v2, 7 +; GCN-NEXT: v_readlane_b32 s24, v2, 8 +; GCN-NEXT: v_readlane_b32 s25, v2, 9 +; GCN-NEXT: v_readlane_b32 s26, v2, 10 +; GCN-NEXT: v_readlane_b32 s27, v2, 11 +; GCN-NEXT: v_readlane_b32 s28, v2, 12 +; GCN-NEXT: v_readlane_b32 s29, v2, 13 +; GCN-NEXT: 
v_readlane_b32 s30, v2, 14 +; GCN-NEXT: v_readlane_b32 s31, v2, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 ; GCN-NEXT: ;;#ASMEND @@ -1067,32 +1124,24 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v31, 48 -; GCN-NEXT: v_readlane_b32 s5, v31, 49 -; GCN-NEXT: v_readlane_b32 s6, v31, 50 -; GCN-NEXT: v_readlane_b32 s7, v31, 51 -; GCN-NEXT: v_readlane_b32 s8, v31, 52 -; GCN-NEXT: v_readlane_b32 s9, v31, 53 -; GCN-NEXT: v_readlane_b32 s10, v31, 54 -; GCN-NEXT: v_readlane_b32 s11, v31, 55 -; GCN-NEXT: v_readlane_b32 s12, v31, 56 -; GCN-NEXT: v_readlane_b32 s13, v31, 57 -; GCN-NEXT: v_readlane_b32 s14, v31, 58 -; GCN-NEXT: v_readlane_b32 s15, v31, 59 -; GCN-NEXT: v_readlane_b32 s16, v31, 60 -; GCN-NEXT: v_readlane_b32 s17, v31, 61 -; GCN-NEXT: v_readlane_b32 s18, v31, 62 -; GCN-NEXT: v_readlane_b32 s19, v31, 63 -; GCN-NEXT: s_mov_b64 s[2:3], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v2, 48 +; GCN-NEXT: v_readlane_b32 s5, v2, 49 +; GCN-NEXT: v_readlane_b32 s6, v2, 50 +; GCN-NEXT: v_readlane_b32 s7, v2, 51 +; GCN-NEXT: v_readlane_b32 s8, v2, 52 +; GCN-NEXT: v_readlane_b32 s9, v2, 53 +; GCN-NEXT: v_readlane_b32 s10, v2, 54 +; GCN-NEXT: v_readlane_b32 s11, v2, 55 +; GCN-NEXT: v_readlane_b32 s12, v2, 56 +; GCN-NEXT: v_readlane_b32 s13, v2, 57 +; GCN-NEXT: v_readlane_b32 s14, v2, 58 +; GCN-NEXT: v_readlane_b32 s15, v2, 59 +; GCN-NEXT: v_readlane_b32 s16, v2, 60 +; GCN-NEXT: v_readlane_b32 s17, v2, 61 +; GCN-NEXT: v_readlane_b32 s18, v2, 62 +; GCN-NEXT: v_readlane_b32 s19, v2, 63 ; GCN-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; 
GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -1,22 +1,377 @@ -; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -; This ends up needing to spill SGPRs to memory, and also does not -; have any free SGPRs available to save the exec mask when doing so. -; The register scavenger also needs to use the emergency stack slot, -; which tries to place the scavenged register restore instruction as -; far the block as possible, near the terminator. This places a -; restore instruction between the condition and the conditional -; branch, which gets expanded into a sequence involving s_not_b64 on -; the exec mask, clobbering SCC value before the branch. We probably -; have to stop relying on being able to flip and restore the exec -; mask, and always require a free SGPR for saving exec. +; This was a negative test to catch an extreme case when all options are exhausted +; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs +; the edge case won't arise and the test would always compile. 
-; CHECK: *** Bad machine code: Using an undefined physical register *** -; CHECK-NEXT: - function: kernel0 -; CHECK-NEXT: - basic block: %bb.0 -; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc -; CHECK-NEXT: - operand 1: implicit killed $scc define amdgpu_kernel void @kernel0(i32 addrspace(1)* %out, i32 %in) #1 { +; CHECK-LABEL: kernel0: +; CHECK: ; %bb.0: +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: v_writelane_b32 v0, s2, 0 +; CHECK-NEXT: v_writelane_b32 v0, s3, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 2 +; CHECK-NEXT: v_writelane_b32 v0, s5, 3 +; CHECK-NEXT: v_writelane_b32 v0, s6, 4 +; CHECK-NEXT: v_writelane_b32 v0, s7, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 6 +; CHECK-NEXT: v_writelane_b32 v0, s5, 7 +; CHECK-NEXT: v_writelane_b32 v0, s6, 8 +; CHECK-NEXT: v_writelane_b32 v0, s7, 9 +; CHECK-NEXT: v_writelane_b32 v0, s8, 10 +; CHECK-NEXT: v_writelane_b32 v0, s9, 11 +; CHECK-NEXT: v_writelane_b32 v0, s10, 12 +; CHECK-NEXT: v_writelane_b32 v0, s11, 13 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:19] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 14 +; CHECK-NEXT: v_writelane_b32 v0, s5, 15 +; CHECK-NEXT: v_writelane_b32 v0, s6, 16 +; CHECK-NEXT: v_writelane_b32 v0, s7, 17 +; CHECK-NEXT: v_writelane_b32 v0, s8, 18 +; CHECK-NEXT: v_writelane_b32 v0, s9, 19 +; CHECK-NEXT: v_writelane_b32 v0, s10, 20 +; CHECK-NEXT: v_writelane_b32 v0, s11, 21 +; CHECK-NEXT: v_writelane_b32 v0, 
s12, 22 +; CHECK-NEXT: v_writelane_b32 v0, s13, 23 +; CHECK-NEXT: v_writelane_b32 v0, s14, 24 +; CHECK-NEXT: v_writelane_b32 v0, s15, 25 +; CHECK-NEXT: v_writelane_b32 v0, s16, 26 +; CHECK-NEXT: v_writelane_b32 v0, s17, 27 +; CHECK-NEXT: v_writelane_b32 v0, s18, 28 +; CHECK-NEXT: v_writelane_b32 v0, s19, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[2:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s2, 30 +; CHECK-NEXT: v_writelane_b32 v0, s3, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 32 +; CHECK-NEXT: v_writelane_b32 v0, s5, 33 +; CHECK-NEXT: v_writelane_b32 v0, s6, 34 +; CHECK-NEXT: v_writelane_b32 v0, s7, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s4, 36 +; CHECK-NEXT: v_writelane_b32 v0, s5, 37 +; CHECK-NEXT: v_writelane_b32 v0, s6, 38 +; CHECK-NEXT: v_writelane_b32 v0, s7, 39 +; CHECK-NEXT: v_writelane_b32 v0, s8, 40 +; CHECK-NEXT: v_writelane_b32 v0, s9, 41 +; CHECK-NEXT: v_writelane_b32 v0, s10, 42 +; CHECK-NEXT: v_writelane_b32 v0, s11, 43 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[16:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[52:53] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[36:43] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v0, s0, 44 +; CHECK-NEXT: v_writelane_b32 v0, s1, 45 +; CHECK-NEXT: v_writelane_b32 v0, s2, 46 +; CHECK-NEXT: v_writelane_b32 v0, s3, 47 +; CHECK-NEXT: v_writelane_b32 v0, s4, 48 +; CHECK-NEXT: v_writelane_b32 v0, s5, 49 +; CHECK-NEXT: v_writelane_b32 v0, s6, 50 +; CHECK-NEXT: v_writelane_b32 v0, s7, 51 +; CHECK-NEXT: v_writelane_b32 v0, s8, 52 +; 
CHECK-NEXT: v_writelane_b32 v0, s9, 53 +; CHECK-NEXT: v_writelane_b32 v0, s10, 54 +; CHECK-NEXT: v_writelane_b32 v0, s11, 55 +; CHECK-NEXT: v_writelane_b32 v0, s12, 56 +; CHECK-NEXT: v_writelane_b32 v0, s13, 57 +; CHECK-NEXT: v_writelane_b32 v0, s14, 58 +; CHECK-NEXT: v_writelane_b32 v0, s15, 59 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[34:35] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[44:47] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: v_writelane_b32 v0, s0, 60 +; CHECK-NEXT: v_writelane_b32 v1, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s1, 61 +; CHECK-NEXT: v_writelane_b32 v1, s5, 1 +; CHECK-NEXT: v_writelane_b32 v0, s2, 62 +; CHECK-NEXT: v_writelane_b32 v1, s6, 2 +; CHECK-NEXT: v_writelane_b32 v0, s3, 63 +; CHECK-NEXT: v_writelane_b32 v1, s7, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 4 +; CHECK-NEXT: v_writelane_b32 v1, s1, 5 +; CHECK-NEXT: v_writelane_b32 v1, s2, 6 +; CHECK-NEXT: v_writelane_b32 v1, s3, 7 +; CHECK-NEXT: v_writelane_b32 v1, s4, 8 +; CHECK-NEXT: v_writelane_b32 v1, s5, 9 +; CHECK-NEXT: v_writelane_b32 v1, s6, 10 +; CHECK-NEXT: v_writelane_b32 v1, s7, 11 +; CHECK-NEXT: v_writelane_b32 v1, s8, 12 +; CHECK-NEXT: v_writelane_b32 v1, s9, 13 +; CHECK-NEXT: v_writelane_b32 v1, s10, 14 +; CHECK-NEXT: v_writelane_b32 v1, s11, 15 +; CHECK-NEXT: v_writelane_b32 v1, s12, 16 +; CHECK-NEXT: v_writelane_b32 v1, s13, 17 +; CHECK-NEXT: v_writelane_b32 v1, s14, 18 +; CHECK-NEXT: v_writelane_b32 v1, s15, 19 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[54:55] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 20 +; CHECK-NEXT: v_writelane_b32 v1, s1, 21 +; CHECK-NEXT: v_writelane_b32 v1, s2, 22 +; CHECK-NEXT: v_writelane_b32 v1, s3, 23 +; 
CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 24 +; CHECK-NEXT: v_writelane_b32 v1, s1, 25 +; CHECK-NEXT: v_writelane_b32 v1, s2, 26 +; CHECK-NEXT: v_writelane_b32 v1, s3, 27 +; CHECK-NEXT: v_writelane_b32 v1, s4, 28 +; CHECK-NEXT: v_writelane_b32 v1, s5, 29 +; CHECK-NEXT: v_writelane_b32 v1, s6, 30 +; CHECK-NEXT: v_writelane_b32 v1, s7, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v1, s0, 32 +; CHECK-NEXT: v_writelane_b32 v1, s1, 33 +; CHECK-NEXT: v_writelane_b32 v1, s2, 34 +; CHECK-NEXT: v_writelane_b32 v1, s3, 35 +; CHECK-NEXT: v_writelane_b32 v1, s4, 36 +; CHECK-NEXT: v_writelane_b32 v1, s5, 37 +; CHECK-NEXT: v_writelane_b32 v1, s6, 38 +; CHECK-NEXT: v_writelane_b32 v1, s7, 39 +; CHECK-NEXT: v_writelane_b32 v1, s8, 40 +; CHECK-NEXT: v_writelane_b32 v1, s9, 41 +; CHECK-NEXT: v_writelane_b32 v1, s10, 42 +; CHECK-NEXT: v_writelane_b32 v1, s11, 43 +; CHECK-NEXT: v_writelane_b32 v1, s12, 44 +; CHECK-NEXT: v_writelane_b32 v1, s13, 45 +; CHECK-NEXT: v_writelane_b32 v1, s14, 46 +; CHECK-NEXT: v_writelane_b32 v1, s15, 47 +; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %ret +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB0_2: ; %bb0 +; CHECK-NEXT: v_readlane_b32 s0, v0, 0 +; CHECK-NEXT: v_readlane_b32 s1, v0, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 2 +; CHECK-NEXT: v_readlane_b32 s1, v0, 3 +; CHECK-NEXT: v_readlane_b32 s2, v0, 4 +; CHECK-NEXT: v_readlane_b32 s3, v0, 5 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 6 +; CHECK-NEXT: v_readlane_b32 s1, v0, 7 +; CHECK-NEXT: v_readlane_b32 s2, v0, 8 +; CHECK-NEXT: v_readlane_b32 s3, v0, 9 +; CHECK-NEXT: v_readlane_b32 s4, v0, 10 +; CHECK-NEXT: v_readlane_b32 s5, v0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 12 +; CHECK-NEXT: 
v_readlane_b32 s7, v0, 13 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 14 +; CHECK-NEXT: v_readlane_b32 s1, v0, 15 +; CHECK-NEXT: v_readlane_b32 s2, v0, 16 +; CHECK-NEXT: v_readlane_b32 s3, v0, 17 +; CHECK-NEXT: v_readlane_b32 s4, v0, 18 +; CHECK-NEXT: v_readlane_b32 s5, v0, 19 +; CHECK-NEXT: v_readlane_b32 s6, v0, 20 +; CHECK-NEXT: v_readlane_b32 s7, v0, 21 +; CHECK-NEXT: v_readlane_b32 s8, v0, 22 +; CHECK-NEXT: v_readlane_b32 s9, v0, 23 +; CHECK-NEXT: v_readlane_b32 s10, v0, 24 +; CHECK-NEXT: v_readlane_b32 s11, v0, 25 +; CHECK-NEXT: v_readlane_b32 s12, v0, 26 +; CHECK-NEXT: v_readlane_b32 s13, v0, 27 +; CHECK-NEXT: v_readlane_b32 s14, v0, 28 +; CHECK-NEXT: v_readlane_b32 s15, v0, 29 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 30 +; CHECK-NEXT: v_readlane_b32 s1, v0, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 32 +; CHECK-NEXT: v_readlane_b32 s1, v0, 33 +; CHECK-NEXT: v_readlane_b32 s2, v0, 34 +; CHECK-NEXT: v_readlane_b32 s3, v0, 35 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 36 +; CHECK-NEXT: v_readlane_b32 s1, v0, 37 +; CHECK-NEXT: v_readlane_b32 s2, v0, 38 +; CHECK-NEXT: v_readlane_b32 s3, v0, 39 +; CHECK-NEXT: v_readlane_b32 s4, v0, 40 +; CHECK-NEXT: v_readlane_b32 s5, v0, 41 +; CHECK-NEXT: v_readlane_b32 s6, v0, 42 +; CHECK-NEXT: v_readlane_b32 s7, v0, 43 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 44 +; CHECK-NEXT: v_readlane_b32 s1, v0, 45 +; CHECK-NEXT: v_readlane_b32 s2, v0, 46 +; CHECK-NEXT: v_readlane_b32 s3, v0, 47 +; CHECK-NEXT: v_readlane_b32 s4, v0, 48 +; CHECK-NEXT: v_readlane_b32 s5, v0, 49 +; CHECK-NEXT: v_readlane_b32 s6, v0, 50 +; CHECK-NEXT: v_readlane_b32 s7, v0, 51 +; 
CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[16:31] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[52:53] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[36:43] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s8, v0, 52 +; CHECK-NEXT: v_readlane_b32 s9, v0, 53 +; CHECK-NEXT: v_readlane_b32 s10, v0, 54 +; CHECK-NEXT: v_readlane_b32 s11, v0, 55 +; CHECK-NEXT: v_readlane_b32 s12, v0, 56 +; CHECK-NEXT: v_readlane_b32 s13, v0, 57 +; CHECK-NEXT: v_readlane_b32 s14, v0, 58 +; CHECK-NEXT: v_readlane_b32 s15, v0, 59 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v0, 60 +; CHECK-NEXT: v_readlane_b32 s1, v0, 61 +; CHECK-NEXT: v_readlane_b32 s2, v0, 62 +; CHECK-NEXT: v_readlane_b32 s3, v0, 63 +; CHECK-NEXT: v_readlane_b32 s4, v1, 0 +; CHECK-NEXT: v_readlane_b32 s5, v1, 1 +; CHECK-NEXT: v_readlane_b32 s6, v1, 2 +; CHECK-NEXT: v_readlane_b32 s7, v1, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[34:35] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[44:47] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 4 +; CHECK-NEXT: v_readlane_b32 s1, v1, 5 +; CHECK-NEXT: v_readlane_b32 s2, v1, 6 +; CHECK-NEXT: v_readlane_b32 s3, v1, 7 +; CHECK-NEXT: v_readlane_b32 s4, v1, 8 +; CHECK-NEXT: v_readlane_b32 s5, v1, 9 +; CHECK-NEXT: v_readlane_b32 s6, v1, 10 +; CHECK-NEXT: v_readlane_b32 s7, v1, 11 +; CHECK-NEXT: v_readlane_b32 s8, v1, 12 +; CHECK-NEXT: v_readlane_b32 s9, v1, 13 +; CHECK-NEXT: v_readlane_b32 s10, v1, 14 +; CHECK-NEXT: v_readlane_b32 s11, v1, 15 +; CHECK-NEXT: v_readlane_b32 s12, v1, 16 +; CHECK-NEXT: v_readlane_b32 s13, v1, 17 +; CHECK-NEXT: v_readlane_b32 s14, v1, 18 +; CHECK-NEXT: v_readlane_b32 s15, v1, 19 +; CHECK-NEXT: ;;#ASMSTART +; 
CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 20 +; CHECK-NEXT: v_readlane_b32 s1, v1, 21 +; CHECK-NEXT: v_readlane_b32 s2, v1, 22 +; CHECK-NEXT: v_readlane_b32 s3, v1, 23 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[54:55] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 24 +; CHECK-NEXT: v_readlane_b32 s1, v1, 25 +; CHECK-NEXT: v_readlane_b32 s2, v1, 26 +; CHECK-NEXT: v_readlane_b32 s3, v1, 27 +; CHECK-NEXT: v_readlane_b32 s4, v1, 28 +; CHECK-NEXT: v_readlane_b32 s5, v1, 29 +; CHECK-NEXT: v_readlane_b32 s6, v1, 30 +; CHECK-NEXT: v_readlane_b32 s7, v1, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v1, 32 +; CHECK-NEXT: v_readlane_b32 s1, v1, 33 +; CHECK-NEXT: v_readlane_b32 s2, v1, 34 +; CHECK-NEXT: v_readlane_b32 s3, v1, 35 +; CHECK-NEXT: v_readlane_b32 s4, v1, 36 +; CHECK-NEXT: v_readlane_b32 s5, v1, 37 +; CHECK-NEXT: v_readlane_b32 s6, v1, 38 +; CHECK-NEXT: v_readlane_b32 s7, v1, 39 +; CHECK-NEXT: v_readlane_b32 s8, v1, 40 +; CHECK-NEXT: v_readlane_b32 s9, v1, 41 +; CHECK-NEXT: v_readlane_b32 s10, v1, 42 +; CHECK-NEXT: v_readlane_b32 s11, v1, 43 +; CHECK-NEXT: v_readlane_b32 s12, v1, 44 +; CHECK-NEXT: v_readlane_b32 s13, v1, 45 +; CHECK-NEXT: v_readlane_b32 s14, v1, 46 +; CHECK-NEXT: v_readlane_b32 s15, v1, 47 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:15] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,4 +1,6 @@ -# 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0. # Otherwise, the test would crash during PEI while trying to replace the dead frame index. @@ -39,13 +41,21 @@ workGroupIDX: { reg: '$sgpr8' } privateSegmentWaveByteOffset: { reg: '$sgpr9' } body: | - ; CHECK-LABEL: name: test - ; CHECK: bb.0: - ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0 - ; CHECK: DBG_VALUE $noreg, 0 - ; CHECK: bb.1: - ; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0 - ; CHECK: S_ENDPGM 0 + ; SGPR_SPILL-LABEL: name: test + ; SGPR_SPILL: bb.0: + ; SGPR_SPILL: [[VGPR:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; SGPR_SPILL: [[VGPR]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[VGPR]] + ; SGPR_SPILL: DBG_VALUE $noreg, 0 + ; SGPR_SPILL: bb.1: + ; SGPR_SPILL: $sgpr10 = V_READLANE_B32 [[VGPR]], 0 + ; SGPR_SPILL: S_ENDPGM 0 + ; PEI-LABEL: name: test + ; PEI: bb.0: + ; PEI: renamable $[[VGPR:vgpr[0-9]+]] = IMPLICIT_DEF + ; PEI: renamable $[[VGPR]] = V_WRITELANE_B32 killed $sgpr10, 0, killed $[[VGPR]] + ; PEI: bb.1: + ; PEI: $sgpr10 = V_READLANE_B32 killed $[[VGPR]], 0 + ; PEI: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git 
a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0. # Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise. @@ -45,7 +45,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0: - ; CHECK: DBG_VALUE $noreg, 0 + ; CHECK: DBG_VALUE bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; The first 64 SGPR spills can go to a VGPR, but there isn't a second -; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. 
+; This test was originally written when SGPRs were spilled directly to physical VGPRs and +; stressed a case where there weren't enough VGPRs to accommodate all spills. +; Now that they are spilled into virtual VGPR lanes, spilling always succeeds. +; The regalloc pass later takes care of allocating VGPRs to these virtual registers. define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: @@ -23,179 +25,179 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 0 -; GCN-NEXT: v_writelane_b32 v23, s9, 1 -; GCN-NEXT: v_writelane_b32 v23, s10, 2 -; GCN-NEXT: v_writelane_b32 v23, s11, 3 -; GCN-NEXT: v_writelane_b32 v23, s12, 4 -; GCN-NEXT: v_writelane_b32 v23, s13, 5 -; GCN-NEXT: v_writelane_b32 v23, s14, 6 -; GCN-NEXT: v_writelane_b32 v23, s15, 7 -; GCN-NEXT: v_writelane_b32 v23, s16, 8 -; GCN-NEXT: v_writelane_b32 v23, s17, 9 -; GCN-NEXT: v_writelane_b32 v23, s18, 10 -; GCN-NEXT: v_writelane_b32 v23, s19, 11 -; GCN-NEXT: v_writelane_b32 v23, s20, 12 -; GCN-NEXT: v_writelane_b32 v23, s21, 13 -; GCN-NEXT: v_writelane_b32 v23, s22, 14 -; GCN-NEXT: v_writelane_b32 v23, s23, 15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s8, 0 +; GCN-NEXT: v_writelane_b32 v0, s9, 1 +; GCN-NEXT: v_writelane_b32 v0, s10, 2 +; GCN-NEXT: v_writelane_b32 v0, s11, 3 +; GCN-NEXT: v_writelane_b32 v0, s12, 4 +; GCN-NEXT: v_writelane_b32 v0, s13, 5 +; GCN-NEXT: v_writelane_b32 v0, s14, 6 +; GCN-NEXT: v_writelane_b32 v0, s15, 7 +; GCN-NEXT: v_writelane_b32 v0, s16, 8 +; GCN-NEXT: v_writelane_b32 v0, s17, 9 +; GCN-NEXT: v_writelane_b32 v0, s18, 10 +; GCN-NEXT: v_writelane_b32 v0, s19, 11 +; GCN-NEXT: v_writelane_b32 v0, s20, 12 +; GCN-NEXT: v_writelane_b32 v0, s21, 13 +; GCN-NEXT: v_writelane_b32 v0, s22, 14 +; GCN-NEXT: v_writelane_b32 v0, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; 
GCN-NEXT: v_writelane_b32 v23, s8, 16 -; GCN-NEXT: v_writelane_b32 v23, s9, 17 -; GCN-NEXT: v_writelane_b32 v23, s10, 18 -; GCN-NEXT: v_writelane_b32 v23, s11, 19 -; GCN-NEXT: v_writelane_b32 v23, s12, 20 -; GCN-NEXT: v_writelane_b32 v23, s13, 21 -; GCN-NEXT: v_writelane_b32 v23, s14, 22 -; GCN-NEXT: v_writelane_b32 v23, s15, 23 -; GCN-NEXT: v_writelane_b32 v23, s16, 24 -; GCN-NEXT: v_writelane_b32 v23, s17, 25 -; GCN-NEXT: v_writelane_b32 v23, s18, 26 -; GCN-NEXT: v_writelane_b32 v23, s19, 27 -; GCN-NEXT: v_writelane_b32 v23, s20, 28 -; GCN-NEXT: v_writelane_b32 v23, s21, 29 -; GCN-NEXT: v_writelane_b32 v23, s22, 30 -; GCN-NEXT: v_writelane_b32 v23, s23, 31 +; GCN-NEXT: v_writelane_b32 v0, s8, 16 +; GCN-NEXT: v_writelane_b32 v0, s9, 17 +; GCN-NEXT: v_writelane_b32 v0, s10, 18 +; GCN-NEXT: v_writelane_b32 v0, s11, 19 +; GCN-NEXT: v_writelane_b32 v0, s12, 20 +; GCN-NEXT: v_writelane_b32 v0, s13, 21 +; GCN-NEXT: v_writelane_b32 v0, s14, 22 +; GCN-NEXT: v_writelane_b32 v0, s15, 23 +; GCN-NEXT: v_writelane_b32 v0, s16, 24 +; GCN-NEXT: v_writelane_b32 v0, s17, 25 +; GCN-NEXT: v_writelane_b32 v0, s18, 26 +; GCN-NEXT: v_writelane_b32 v0, s19, 27 +; GCN-NEXT: v_writelane_b32 v0, s20, 28 +; GCN-NEXT: v_writelane_b32 v0, s21, 29 +; GCN-NEXT: v_writelane_b32 v0, s22, 30 +; GCN-NEXT: v_writelane_b32 v0, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 32 -; GCN-NEXT: v_writelane_b32 v23, s9, 33 -; GCN-NEXT: v_writelane_b32 v23, s10, 34 -; GCN-NEXT: v_writelane_b32 v23, s11, 35 -; GCN-NEXT: v_writelane_b32 v23, s12, 36 -; GCN-NEXT: v_writelane_b32 v23, s13, 37 -; GCN-NEXT: v_writelane_b32 v23, s14, 38 -; GCN-NEXT: v_writelane_b32 v23, s15, 39 -; GCN-NEXT: v_writelane_b32 v23, s16, 40 -; GCN-NEXT: v_writelane_b32 v23, s17, 41 -; GCN-NEXT: v_writelane_b32 v23, s18, 42 -; GCN-NEXT: v_writelane_b32 v23, s19, 43 -; GCN-NEXT: v_writelane_b32 v23, s20, 44 -; GCN-NEXT: v_writelane_b32 v23, s21, 45 -; GCN-NEXT: 
v_writelane_b32 v23, s22, 46 -; GCN-NEXT: v_writelane_b32 v23, s23, 47 +; GCN-NEXT: v_writelane_b32 v0, s8, 32 +; GCN-NEXT: v_writelane_b32 v0, s9, 33 +; GCN-NEXT: v_writelane_b32 v0, s10, 34 +; GCN-NEXT: v_writelane_b32 v0, s11, 35 +; GCN-NEXT: v_writelane_b32 v0, s12, 36 +; GCN-NEXT: v_writelane_b32 v0, s13, 37 +; GCN-NEXT: v_writelane_b32 v0, s14, 38 +; GCN-NEXT: v_writelane_b32 v0, s15, 39 +; GCN-NEXT: v_writelane_b32 v0, s16, 40 +; GCN-NEXT: v_writelane_b32 v0, s17, 41 +; GCN-NEXT: v_writelane_b32 v0, s18, 42 +; GCN-NEXT: v_writelane_b32 v0, s19, 43 +; GCN-NEXT: v_writelane_b32 v0, s20, 44 +; GCN-NEXT: v_writelane_b32 v0, s21, 45 +; GCN-NEXT: v_writelane_b32 v0, s22, 46 +; GCN-NEXT: v_writelane_b32 v0, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v23, s8, 48 -; GCN-NEXT: v_writelane_b32 v23, s9, 49 -; GCN-NEXT: v_writelane_b32 v23, s10, 50 -; GCN-NEXT: v_writelane_b32 v23, s11, 51 -; GCN-NEXT: v_writelane_b32 v23, s12, 52 -; GCN-NEXT: v_writelane_b32 v23, s13, 53 -; GCN-NEXT: v_writelane_b32 v23, s14, 54 -; GCN-NEXT: v_writelane_b32 v23, s15, 55 -; GCN-NEXT: v_writelane_b32 v23, s16, 56 -; GCN-NEXT: v_writelane_b32 v23, s17, 57 -; GCN-NEXT: v_writelane_b32 v23, s18, 58 -; GCN-NEXT: v_writelane_b32 v23, s19, 59 -; GCN-NEXT: v_writelane_b32 v23, s20, 60 -; GCN-NEXT: v_writelane_b32 v23, s21, 61 -; GCN-NEXT: v_writelane_b32 v23, s22, 62 -; GCN-NEXT: v_writelane_b32 v23, s23, 63 +; GCN-NEXT: v_writelane_b32 v0, s8, 48 +; GCN-NEXT: v_writelane_b32 v0, s9, 49 +; GCN-NEXT: v_writelane_b32 v0, s10, 50 +; GCN-NEXT: v_writelane_b32 v0, s11, 51 +; GCN-NEXT: v_writelane_b32 v0, s12, 52 +; GCN-NEXT: v_writelane_b32 v0, s13, 53 +; GCN-NEXT: v_writelane_b32 v0, s14, 54 +; GCN-NEXT: v_writelane_b32 v0, s15, 55 +; GCN-NEXT: v_writelane_b32 v0, s16, 56 +; GCN-NEXT: v_writelane_b32 v0, s17, 57 +; GCN-NEXT: v_writelane_b32 v0, s18, 58 +; GCN-NEXT: v_writelane_b32 v0, s19, 59 +; GCN-NEXT: v_writelane_b32 v0, s20, 60 
+; GCN-NEXT: v_writelane_b32 v0, s21, 61 +; GCN-NEXT: v_writelane_b32 v0, s22, 62 +; GCN-NEXT: v_writelane_b32 v0, s23, 63 +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_mov_b64 s[8:9], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, s5 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s4, v23, 0 -; GCN-NEXT: v_readlane_b32 s5, v23, 1 -; GCN-NEXT: v_readlane_b32 s6, v23, 2 -; GCN-NEXT: v_readlane_b32 s7, v23, 3 -; GCN-NEXT: v_readlane_b32 s8, v23, 4 -; GCN-NEXT: v_readlane_b32 s9, v23, 5 -; GCN-NEXT: v_readlane_b32 s10, v23, 6 -; GCN-NEXT: v_readlane_b32 s11, v23, 7 -; GCN-NEXT: v_readlane_b32 s12, v23, 8 -; GCN-NEXT: v_readlane_b32 s13, v23, 9 -; GCN-NEXT: v_readlane_b32 s14, v23, 10 -; GCN-NEXT: v_readlane_b32 s15, v23, 11 -; GCN-NEXT: v_readlane_b32 s16, v23, 12 -; GCN-NEXT: v_readlane_b32 s17, v23, 13 -; GCN-NEXT: v_readlane_b32 s18, v23, 14 -; GCN-NEXT: v_readlane_b32 s19, v23, 15 +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[24:25] +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; GCN-NEXT: v_readlane_b32 s6, v1, 2 +; GCN-NEXT: v_readlane_b32 s7, v1, 3 +; GCN-NEXT: v_readlane_b32 s8, v1, 4 +; GCN-NEXT: v_readlane_b32 s9, v1, 5 +; GCN-NEXT: v_readlane_b32 s10, v1, 6 +; GCN-NEXT: v_readlane_b32 s11, v1, 7 +; GCN-NEXT: v_readlane_b32 s12, v1, 8 +; GCN-NEXT: v_readlane_b32 s13, v1, 9 +; GCN-NEXT: v_readlane_b32 s14, v1, 10 +; GCN-NEXT: v_readlane_b32 s15, v1, 11 +; GCN-NEXT: v_readlane_b32 s16, v1, 12 +; GCN-NEXT: v_readlane_b32 s17, v1, 13 +; GCN-NEXT: v_readlane_b32 s18, v1, 14 +; GCN-NEXT: v_readlane_b32 s19, v1, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v23, 16 -; GCN-NEXT: v_readlane_b32 s5, v23, 17 -; GCN-NEXT: v_readlane_b32 s6, v23, 18 -; GCN-NEXT: v_readlane_b32 s7, v23, 19 -; GCN-NEXT: v_readlane_b32 s8, v23, 20 -; GCN-NEXT: v_readlane_b32 s9, v23, 21 -; GCN-NEXT: v_readlane_b32 s10, v23, 22 -; GCN-NEXT: v_readlane_b32 s11, v23, 23 -; GCN-NEXT: v_readlane_b32 s12, v23, 24 -; GCN-NEXT: v_readlane_b32 s13, v23, 25 -; GCN-NEXT: v_readlane_b32 s14, v23, 26 -; GCN-NEXT: v_readlane_b32 s15, v23, 27 -; GCN-NEXT: v_readlane_b32 s16, v23, 28 -; GCN-NEXT: v_readlane_b32 s17, v23, 29 -; GCN-NEXT: v_readlane_b32 s18, v23, 30 -; GCN-NEXT: v_readlane_b32 s19, v23, 31 +; GCN-NEXT: v_readlane_b32 s4, v1, 16 +; GCN-NEXT: v_readlane_b32 s5, v1, 17 +; GCN-NEXT: v_readlane_b32 s6, v1, 18 +; GCN-NEXT: v_readlane_b32 s7, v1, 19 +; GCN-NEXT: v_readlane_b32 s8, v1, 20 +; GCN-NEXT: v_readlane_b32 s9, v1, 21 +; GCN-NEXT: v_readlane_b32 s10, v1, 22 +; GCN-NEXT: v_readlane_b32 s11, v1, 23 +; GCN-NEXT: v_readlane_b32 s12, v1, 24 +; GCN-NEXT: v_readlane_b32 s13, v1, 25 +; GCN-NEXT: v_readlane_b32 s14, v1, 26 +; GCN-NEXT: v_readlane_b32 s15, v1, 27 +; GCN-NEXT: v_readlane_b32 s16, v1, 28 +; GCN-NEXT: v_readlane_b32 s17, v1, 29 +; GCN-NEXT: v_readlane_b32 s18, v1, 30 +; GCN-NEXT: v_readlane_b32 
s19, v1, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v23, 32 -; GCN-NEXT: v_readlane_b32 s5, v23, 33 -; GCN-NEXT: v_readlane_b32 s6, v23, 34 -; GCN-NEXT: v_readlane_b32 s7, v23, 35 -; GCN-NEXT: v_readlane_b32 s8, v23, 36 -; GCN-NEXT: v_readlane_b32 s9, v23, 37 -; GCN-NEXT: v_readlane_b32 s10, v23, 38 -; GCN-NEXT: v_readlane_b32 s11, v23, 39 -; GCN-NEXT: v_readlane_b32 s12, v23, 40 -; GCN-NEXT: v_readlane_b32 s13, v23, 41 -; GCN-NEXT: v_readlane_b32 s14, v23, 42 -; GCN-NEXT: v_readlane_b32 s15, v23, 43 -; GCN-NEXT: v_readlane_b32 s16, v23, 44 -; GCN-NEXT: v_readlane_b32 s17, v23, 45 -; GCN-NEXT: v_readlane_b32 s18, v23, 46 -; GCN-NEXT: v_readlane_b32 s19, v23, 47 +; GCN-NEXT: v_readlane_b32 s4, v1, 32 +; GCN-NEXT: v_readlane_b32 s5, v1, 33 +; GCN-NEXT: v_readlane_b32 s6, v1, 34 +; GCN-NEXT: v_readlane_b32 s7, v1, 35 +; GCN-NEXT: v_readlane_b32 s8, v1, 36 +; GCN-NEXT: v_readlane_b32 s9, v1, 37 +; GCN-NEXT: v_readlane_b32 s10, v1, 38 +; GCN-NEXT: v_readlane_b32 s11, v1, 39 +; GCN-NEXT: v_readlane_b32 s12, v1, 40 +; GCN-NEXT: v_readlane_b32 s13, v1, 41 +; GCN-NEXT: v_readlane_b32 s14, v1, 42 +; GCN-NEXT: v_readlane_b32 s15, v1, 43 +; GCN-NEXT: v_readlane_b32 s16, v1, 44 +; GCN-NEXT: v_readlane_b32 s17, v1, 45 +; GCN-NEXT: v_readlane_b32 s18, v1, 46 +; GCN-NEXT: v_readlane_b32 s19, v1, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s8, v23, 48 -; GCN-NEXT: v_readlane_b32 s9, v23, 49 -; GCN-NEXT: v_readlane_b32 s10, v23, 50 -; GCN-NEXT: v_readlane_b32 s11, v23, 51 -; GCN-NEXT: v_readlane_b32 s12, v23, 52 -; GCN-NEXT: v_readlane_b32 s13, v23, 53 -; GCN-NEXT: v_readlane_b32 s14, v23, 54 -; GCN-NEXT: v_readlane_b32 s15, v23, 55 -; GCN-NEXT: v_readlane_b32 s16, v23, 56 -; GCN-NEXT: v_readlane_b32 s17, v23, 57 -; GCN-NEXT: v_readlane_b32 s18, v23, 58 -; GCN-NEXT: v_readlane_b32 s19, v23, 59 -; GCN-NEXT: v_readlane_b32 s20, v23, 60 -; GCN-NEXT: v_readlane_b32 
s21, v23, 61 -; GCN-NEXT: v_readlane_b32 s22, v23, 62 -; GCN-NEXT: v_readlane_b32 s23, v23, 63 -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: s_mov_b64 exec, 3 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s8, v1, 48 +; GCN-NEXT: v_readlane_b32 s9, v1, 49 +; GCN-NEXT: v_readlane_b32 s10, v1, 50 +; GCN-NEXT: v_readlane_b32 s11, v1, 51 +; GCN-NEXT: v_readlane_b32 s12, v1, 52 +; GCN-NEXT: v_readlane_b32 s13, v1, 53 +; GCN-NEXT: v_readlane_b32 s14, v1, 54 +; GCN-NEXT: v_readlane_b32 s15, v1, 55 +; GCN-NEXT: v_readlane_b32 s16, v1, 56 +; GCN-NEXT: v_readlane_b32 s17, v1, 57 +; GCN-NEXT: v_readlane_b32 s18, v1, 58 +; GCN-NEXT: v_readlane_b32 s19, v1, 59 +; GCN-NEXT: v_readlane_b32 s20, v1, 60 +; GCN-NEXT: v_readlane_b32 s21, v1, 61 +; GCN-NEXT: v_readlane_b32 s22, v1, 62 +; GCN-NEXT: v_readlane_b32 s23, v1, 63 ; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_readlane_b32 s5, v0, 1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[8:23] ; GCN-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -20,10 +20,11 @@ liveins: $sgpr4 ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 - ; CHECK: liveins: $sgpr4, $vgpr0 + ; CHECK: liveins: $sgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], 
implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... @@ -45,10 +46,11 @@ liveins: $sgpr5 ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 - ; CHECK: liveins: $sgpr5, $vgpr0 + ; CHECK: liveins: $sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... 
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -12,16 +12,17 @@ ; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 +; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_mov_b32 s39, 0xe00000 -; GCN-NEXT: v_writelane_b32 v40, s4, 0 +; GCN-NEXT: v_writelane_b32 v3, s4, 0 ; GCN-NEXT: s_add_u32 s36, s36, s11 -; GCN-NEXT: v_writelane_b32 v40, s5, 1 +; GCN-NEXT: v_writelane_b32 v3, s5, 1 ; GCN-NEXT: s_addc_u32 s37, s37, 0 ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: v_readlane_b32 s0, v40, 0 +; GCN-NEXT: v_readlane_b32 s0, v3, 0 ; GCN-NEXT: s_mov_b32 s13, s9 ; GCN-NEXT: s_mov_b32 s12, s8 -; GCN-NEXT: v_readlane_b32 s1, v40, 1 +; GCN-NEXT: v_readlane_b32 s1, v3, 1 ; GCN-NEXT: s_add_u32 s8, s0, 36 ; GCN-NEXT: s_addc_u32 s9, s1, 0 ; GCN-NEXT: s_getpc_b64 s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s # Check that we allocate 2 emergency stack slots if we're spilling # SGPRs to memory and potentially have an offset larger than fits in @@ -29,7 +29,7 @@ ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec ; 
CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr2 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr2 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,10 +16,10 @@ ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s14, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -133,13 +133,20 @@ ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v255, s30, 0 -; GCN-NEXT: v_writelane_b32 v255, s31, 1 +; 
GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, child_function@gotpcrel32@hi+12 @@ -150,8 +157,8 @@ ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v255, 1 -; GCN-NEXT: v_readlane_b32 s30, v255, 0 +; GCN-NEXT: v_readlane_b32 s31, v0, 1 +; GCN-NEXT: v_readlane_b32 s30, v0, 0 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -263,11 +270,11 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: s_mov_b32 s33, s14 ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -308,10 +315,10 @@ ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s14, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -424,13 +431,20 @@ ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v254, s30, 0 -; GCN-NEXT: v_writelane_b32 v254, s31, 1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, child_function@gotpcrel32@hi+12 @@ -441,8 +455,8 @@ ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s31, v254, 1 -; GCN-NEXT: v_readlane_b32 s30, v254, 0 +; GCN-NEXT: v_readlane_b32 s31, v0, 1 +; GCN-NEXT: v_readlane_b32 s30, v0, 0 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -553,11 +567,11 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: s_mov_b32 s33, s14 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -598,8 +612,8 @@ ; GCN-LABEL: spill_sgpr_with_sgpr_uses: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill @@ -719,10 +733,18 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v254, s4, 0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: 
v_writelane_b32 v0, s4, 0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: v_readlane_b32 s4, v254, 0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s4 ; GCN-NEXT: ;;#ASMEND @@ -837,8 +859,8 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1161,7 +1183,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1275,45 +1298,54 @@ ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:4 
; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: v_writelane_b32 v4, s34, 0 ; GCN-NEXT: v_writelane_b32 v4, s35, 1 ; GCN-NEXT: v_writelane_b32 v4, s36, 2 ; GCN-NEXT: v_writelane_b32 v4, s37, 3 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: v_mov_b32_e32 v5, v3 -; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 -; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec -; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GCN-NEXT: v_mov_b32_e32 v1, v3 +; GCN-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec +; GCN-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GCN-NEXT: v_mov_b32_e32 v2, v4 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GCN-NEXT: v_mov_b32_e32 v4, v5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] +; GCN-NEXT: flat_load_dwordx4 v[3:6], v[3:4] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; GCN-NEXT: v_readlane_b32 s37, v4, 3 -; GCN-NEXT: v_readlane_b32 s36, v4, 2 -; GCN-NEXT: v_readlane_b32 s35, v4, 1 -; GCN-NEXT: v_readlane_b32 s34, v4, 0 +; GCN-NEXT: flat_store_dwordx4 v[1:2], v[3:6] +; GCN-NEXT: v_readlane_b32 s37, v0, 3 +; GCN-NEXT: v_readlane_b32 s36, v0, 2 +; GCN-NEXT: v_readlane_b32 s35, v0, 1 +; GCN-NEXT: v_readlane_b32 s34, v0, 0 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1427,7 +1459,8 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 
s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1508,8 +1541,11 @@ ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s14, s33 ; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -1623,21 +1659,11 @@ ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 s[14:15], exec -; GCN-NEXT: s_mov_b64 exec, 1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 -; GCN-NEXT: v_writelane_b32 v1, s30, 0 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:456 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[14:15] -; GCN-NEXT: s_mov_b64 s[12:13], exec -; GCN-NEXT: s_mov_b64 exec, 1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: v_writelane_b32 v0, s31, 0 -; GCN-NEXT: buffer_store_dword v0, 
off, s[0:3], s33 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, child_function_ipra@rel32@lo+4 @@ -1647,24 +1673,12 @@ ; GCN-NEXT: s_mov_b64 s[0:1], s[8:9] ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: s_mov_b64 s[8:9], exec -; GCN-NEXT: s_mov_b64 exec, 1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v1, 0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:456 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: s_mov_b64 exec, 1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 +; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[12:13] ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v0, 1 ; GCN-NEXT: v_readlane_b32 s30, v0, 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -1777,8 +1791,11 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: s_mov_b32 s33, s14 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -6,16 +6,16 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 +; SGPR: v_mov_b32_e32 v0, vcc_lo +; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 +; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload +; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0 -; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: s_mov_b64 exec, s[4:5] -; SGPR-NEXT: s_nop 1 +; SGPR-NEXT: s_nop 4 ; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; ALL: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -212,15 +212,15 @@ ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,20 +2,22 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN: s_xor_saveexec_b64 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 +; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 -; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 +; GCN: s_xor_saveexec_b64 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -122,16 +122,16 @@ ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Reload +; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -205,16 +205,16 @@ ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b32 s10, 0x40100 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: 
;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b32 s10, 0x40100 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -660,5 +660,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } -attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } -attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted # by the copy propagation after lowering the spill. 
@@ -26,11 +26,12 @@ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -63,10 +64,11 @@ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -93,12 +95,12 @@ ; GCN-LABEL: name: spill_vgpr128_use_subreg ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) - ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1 + ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -123,11 +125,11 @@ ; GCN-LABEL: name: spill_vgpr128_use_kill ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: 
BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10085,16 +10085,25 @@ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v5, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b32 s38, 0 -; GFX6-NEXT: s_mov_b32 s39, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:240 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b64 s[34:35], exec +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; 
GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83400 -; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10102,7 +10111,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:224 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10111,7 +10120,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10120,7 +10129,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; GFX6-NEXT: s_waitcnt 
vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10129,7 +10138,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10138,7 +10147,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10147,7 +10156,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10156,7 +10165,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:128 +; GFX6-NEXT: 
buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10165,7 +10174,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10174,7 +10183,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 ; GFX6-NEXT: s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10183,17 +10192,8 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte 
Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x80400 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10201,17 +10201,32 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x80800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[36:39], 0 addr64 offset:32 -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v9, s0, 0 +; GFX6-NEXT: v_writelane_b32 v9, s1, 1 +; GFX6-NEXT: v_writelane_b32 v9, s2, 2 +; GFX6-NEXT: v_writelane_b32 v9, 
s3, 3 +; GFX6-NEXT: s_mov_b32 s8, 0x80400 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s8 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[34:35] +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, 1 @@ -10219,6 +10234,7 @@ ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[4:11] ; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10230,12 +10246,12 @@ ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 s2, 0x83c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s[8:15] @@ -10253,272 +10269,211 @@ ; GFX6-NEXT: ; def s[2:3] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[36:37] -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v9, s8, 0 -; GFX6-NEXT: v_writelane_b32 v9, s9, 1 -; GFX6-NEXT: v_writelane_b32 v9, s10, 2 -; GFX6-NEXT: v_writelane_b32 v9, s11, 3 -; GFX6-NEXT: 
v_writelane_b32 v9, s12, 4 -; GFX6-NEXT: v_writelane_b32 v9, s13, 5 -; GFX6-NEXT: v_writelane_b32 v9, s14, 6 -; GFX6-NEXT: v_writelane_b32 v9, s15, 7 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2100 -; GFX6-NEXT: buffer_store_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, 0x20e0 -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s8, v8, 0 -; GFX6-NEXT: v_readlane_b32 s9, v8, 1 -; GFX6-NEXT: v_readlane_b32 s10, v8, 2 -; GFX6-NEXT: v_readlane_b32 s11, v8, 3 -; GFX6-NEXT: v_readlane_b32 s12, v8, 4 -; GFX6-NEXT: v_readlane_b32 s13, v8, 5 -; GFX6-NEXT: v_readlane_b32 s14, v8, 6 -; GFX6-NEXT: v_readlane_b32 s15, v8, 7 -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 s[38:39], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v7, s16, 0 -; GFX6-NEXT: v_writelane_b32 v7, s17, 1 -; GFX6-NEXT: v_writelane_b32 v7, s18, 2 -; GFX6-NEXT: v_writelane_b32 v7, s19, 3 -; GFX6-NEXT: v_writelane_b32 v7, s20, 4 -; GFX6-NEXT: v_writelane_b32 v7, s21, 5 -; GFX6-NEXT: v_writelane_b32 v7, s22, 6 -; GFX6-NEXT: v_writelane_b32 v7, s23, 7 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2120 -; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v7, s8, 0 +; GFX6-NEXT: v_writelane_b32 v7, s9, 1 +; GFX6-NEXT: v_writelane_b32 v7, s10, 2 +; GFX6-NEXT: v_writelane_b32 v7, s11, 3 +; 
GFX6-NEXT: v_writelane_b32 v7, s12, 4 +; GFX6-NEXT: v_writelane_b32 v7, s13, 5 +; GFX6-NEXT: v_writelane_b32 v7, s14, 6 +; GFX6-NEXT: v_writelane_b32 v7, s15, 7 +; GFX6-NEXT: s_mov_b32 s36, 0x84400 +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 exec, s[38:39] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2100 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x83c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s16, v9, 0 -; GFX6-NEXT: v_readlane_b32 s17, v9, 1 -; GFX6-NEXT: v_readlane_b32 s18, v9, 2 -; GFX6-NEXT: v_readlane_b32 s19, v9, 3 -; GFX6-NEXT: v_readlane_b32 s20, v9, 4 -; GFX6-NEXT: v_readlane_b32 s21, v9, 5 -; GFX6-NEXT: v_readlane_b32 s22, v9, 6 -; GFX6-NEXT: v_readlane_b32 s23, v9, 7 -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s8, v4, 0 +; GFX6-NEXT: v_readlane_b32 s9, v4, 1 +; GFX6-NEXT: v_readlane_b32 s10, v4, 2 +; GFX6-NEXT: v_readlane_b32 s11, v4, 3 +; GFX6-NEXT: v_readlane_b32 s12, v4, 4 +; GFX6-NEXT: v_readlane_b32 s13, v4, 5 +; GFX6-NEXT: v_readlane_b32 s14, v4, 6 +; GFX6-NEXT: v_readlane_b32 s15, v4, 7 +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 s[38:39], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v8, 
s24, 0 -; GFX6-NEXT: v_writelane_b32 v8, s25, 1 -; GFX6-NEXT: v_writelane_b32 v8, s26, 2 -; GFX6-NEXT: v_writelane_b32 v8, s27, 3 -; GFX6-NEXT: v_writelane_b32 v8, s28, 4 -; GFX6-NEXT: v_writelane_b32 v8, s29, 5 -; GFX6-NEXT: v_writelane_b32 v8, s30, 6 -; GFX6-NEXT: v_writelane_b32 v8, s31, 7 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2140 -; GFX6-NEXT: buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v8, s16, 0 +; GFX6-NEXT: v_writelane_b32 v8, s17, 1 +; GFX6-NEXT: v_writelane_b32 v8, s18, 2 +; GFX6-NEXT: v_writelane_b32 v8, s19, 3 +; GFX6-NEXT: v_writelane_b32 v8, s20, 4 +; GFX6-NEXT: v_writelane_b32 v8, s21, 5 +; GFX6-NEXT: v_writelane_b32 v8, s22, 6 +; GFX6-NEXT: v_writelane_b32 v8, s23, 7 +; GFX6-NEXT: s_mov_b32 s36, 0x84c00 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 exec, s[38:39] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2120 +; GFX6-NEXT: s_mov_b32 s36, 0x84400 ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s24, v7, 0 -; GFX6-NEXT: v_readlane_b32 s25, v7, 1 -; GFX6-NEXT: v_readlane_b32 s26, v7, 2 -; GFX6-NEXT: v_readlane_b32 s27, v7, 3 -; GFX6-NEXT: v_readlane_b32 s28, v7, 4 -; GFX6-NEXT: v_readlane_b32 s29, v7, 5 -; GFX6-NEXT: v_readlane_b32 s30, v7, 6 -; GFX6-NEXT: v_readlane_b32 s31, v7, 7 +; GFX6-NEXT: v_readlane_b32 s16, v7, 0 +; GFX6-NEXT: v_readlane_b32 s17, v7, 1 +; GFX6-NEXT: v_readlane_b32 s18, v7, 2 +; GFX6-NEXT: v_readlane_b32 s19, v7, 3 +; GFX6-NEXT: v_readlane_b32 s20, v7, 4 +; 
GFX6-NEXT: v_readlane_b32 s21, v7, 5 +; GFX6-NEXT: v_readlane_b32 s22, v7, 6 +; GFX6-NEXT: v_readlane_b32 s23, v7, 7 ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v10, s0, 0 -; GFX6-NEXT: v_writelane_b32 v10, s1, 1 -; GFX6-NEXT: v_writelane_b32 v10, s2, 2 -; GFX6-NEXT: v_writelane_b32 v10, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2160 -; GFX6-NEXT: buffer_store_dword v10, v4, s[40:43], 0 offen ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v8, s4, 0 -; GFX6-NEXT: v_writelane_b32 v8, s5, 1 -; GFX6-NEXT: v_writelane_b32 v8, s6, 2 -; GFX6-NEXT: v_writelane_b32 v8, s7, 3 -; GFX6-NEXT: s_mov_b32 s0, 0x85c00 -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 s[38:39], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v9, s2, 0 -; GFX6-NEXT: v_writelane_b32 v9, s3, 1 -; GFX6-NEXT: s_mov_b32 s4, 0x86600 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s4 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v4, s24, 0 +; GFX6-NEXT: v_writelane_b32 v4, s25, 1 +; 
GFX6-NEXT: v_writelane_b32 v4, s26, 2 +; GFX6-NEXT: v_writelane_b32 v4, s27, 3 +; GFX6-NEXT: v_writelane_b32 v4, s28, 4 +; GFX6-NEXT: v_writelane_b32 v4, s29, 5 +; GFX6-NEXT: v_writelane_b32 v4, s30, 6 +; GFX6-NEXT: v_writelane_b32 v4, s31, 7 +; GFX6-NEXT: s_mov_b32 s36, 0x85400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[0:1] +; GFX6-NEXT: s_mov_b64 exec, s[38:39] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2140 -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b32 s36, 0x84c00 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s36 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s0, v7, 0 -; GFX6-NEXT: v_readlane_b32 s1, v7, 1 -; GFX6-NEXT: v_readlane_b32 s2, v7, 2 -; GFX6-NEXT: v_readlane_b32 s3, v7, 3 -; GFX6-NEXT: v_readlane_b32 s4, v7, 4 -; GFX6-NEXT: v_readlane_b32 s5, v7, 5 -; GFX6-NEXT: v_readlane_b32 s6, v7, 6 -; GFX6-NEXT: v_readlane_b32 s7, v7, 7 -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s24, v9, 0 +; GFX6-NEXT: v_readlane_b32 s25, v9, 1 +; GFX6-NEXT: v_readlane_b32 s26, v9, 2 +; GFX6-NEXT: v_readlane_b32 s27, v9, 3 +; GFX6-NEXT: v_readlane_b32 s28, v9, 4 +; GFX6-NEXT: v_readlane_b32 s29, v9, 5 +; GFX6-NEXT: v_readlane_b32 s30, v9, 6 +; GFX6-NEXT: v_readlane_b32 s31, v9, 7 +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 
15 ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v8, s36, 0 -; GFX6-NEXT: v_writelane_b32 v8, s37, 1 -; GFX6-NEXT: v_writelane_b32 v8, s38, 2 -; GFX6-NEXT: v_writelane_b32 v8, s39, 3 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2180 -; GFX6-NEXT: buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v8, s0, 0 +; GFX6-NEXT: v_writelane_b32 v8, s1, 1 +; GFX6-NEXT: v_writelane_b32 v8, s2, 2 +; GFX6-NEXT: v_writelane_b32 v8, s3, 3 +; GFX6-NEXT: s_mov_b32 s38, 0x85c00 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[38:39], exec -; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v10, s36, 0 -; GFX6-NEXT: v_writelane_b32 v10, s37, 1 -; GFX6-NEXT: s_mov_b32 s44, 0x86400 -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s44 ; 4-byte Folded Spill +; GFX6-NEXT: v_writelane_b32 v4, s4, 0 +; GFX6-NEXT: v_writelane_b32 v4, s5, 1 +; GFX6-NEXT: v_writelane_b32 v4, s6, 2 +; GFX6-NEXT: v_writelane_b32 v4, s7, 3 +; GFX6-NEXT: s_mov_b32 s0, 0x86000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s0 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[38:39] ; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2170 -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: buffer_store_dword 
v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s36, v9, 0 -; GFX6-NEXT: v_readlane_b32 s37, v9, 1 -; GFX6-NEXT: v_readlane_b32 s38, v9, 2 -; GFX6-NEXT: v_readlane_b32 s39, v9, 3 -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 +; GFX6-NEXT: v_writelane_b32 v7, s2, 0 +; GFX6-NEXT: v_writelane_b32 v7, s3, 1 +; GFX6-NEXT: s_mov_b32 s0, 0x86400 +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2190 -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 +; GFX6-NEXT: s_mov_b64 s[36:37], exec +; GFX6-NEXT: s_mov_b64 exec, 0xff +; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2190 -; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s44, v7, 0 -; GFX6-NEXT: v_readlane_b32 s45, v7, 1 -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 +; GFX6-NEXT: v_readlane_b32 s0, v9, 0 +; GFX6-NEXT: v_readlane_b32 s1, v9, 1 +; GFX6-NEXT: v_readlane_b32 s2, v9, 2 +; GFX6-NEXT: v_readlane_b32 s3, v9, 3 +; GFX6-NEXT: v_readlane_b32 s4, v9, 4 +; GFX6-NEXT: v_readlane_b32 s5, v9, 5 +; GFX6-NEXT: v_readlane_b32 s6, v9, 6 +; GFX6-NEXT: v_readlane_b32 s7, v9, 7 +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_not_b64 exec, 
exec -; GFX6-NEXT: s_mov_b64 vcc, s[34:35] -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2198 +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2180 ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2198 -; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s34, v8, 0 -; GFX6-NEXT: v_readlane_b32 s35, v8, 1 +; GFX6-NEXT: v_readlane_b32 s36, v8, 0 +; GFX6-NEXT: v_readlane_b32 s37, v8, 1 +; GFX6-NEXT: v_readlane_b32 s38, v8, 2 +; GFX6-NEXT: v_readlane_b32 s39, v8, 3 ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35],s[44:45] -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b64 s[34:35], vcc -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s0, 0x86000 +; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 vcc, s[34:35] +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s0 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s36, v4, 0 -; GFX6-NEXT: v_readlane_b32 s37, v4, 1 -; GFX6-NEXT: v_readlane_b32 s38, v4, 2 -; GFX6-NEXT: v_readlane_b32 s39, v4, 3 +; GFX6-NEXT: v_readlane_b32 s34, v4, 0 +; GFX6-NEXT: v_readlane_b32 s35, v4, 1 ; 
GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_mov_b64 s[34:35], vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x85800 +; GFX6-NEXT: s_mov_b32 s6, 0x85c00 ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s6 ; 4-byte Folded Reload @@ -10530,19 +10485,19 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 s2, 0x83c00 ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: s_mov_b32 s2, 0x84400 ; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: s_mov_b32 s2, 0x84c00 ; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill @@ -10551,17 +10506,17 @@ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ;;#ASMSTART ; 
GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: s_mov_b32 s2, 0x84c00 ; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: s_mov_b32 s2, 0x84400 ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 s2, 0x83c00 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10580,14 +10535,28 @@ ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX6-NEXT: s_mov_b32 s4, 0x83400 +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: s_mov_b32 s2, 0x80400 +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readlane_b32 s4, v10, 0 +; GFX6-NEXT: v_readlane_b32 s5, v10, 1 +; GFX6-NEXT: v_readlane_b32 s6, v10, 2 +; GFX6-NEXT: v_readlane_b32 s7, v10, 3 +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: s_mov_b32 s4, 0x83800 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 ; GFX6-NEXT: 
buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX6-NEXT: s_mov_b32 s4, 0x83000 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: s_mov_b32 s4, 0x83400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10595,7 +10564,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82c00 +; GFX6-NEXT: s_mov_b32 s4, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10603,7 +10572,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82800 +; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10611,7 +10580,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82400 +; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; 
GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10619,7 +10588,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82000 +; GFX6-NEXT: s_mov_b32 s4, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10627,7 +10596,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81c00 +; GFX6-NEXT: s_mov_b32 s4, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10635,7 +10604,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81800 +; GFX6-NEXT: s_mov_b32 s4, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10643,7 +10612,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81400 +; GFX6-NEXT: s_mov_b32 s4, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10651,7 +10620,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81000 +; GFX6-NEXT: s_mov_b32 s4, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10659,7 +10628,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 +; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10667,7 +10636,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80400 +; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10675,7 +10644,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80800 +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 
v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 @@ -10777,16 +10746,13 @@ ; GFX9-FLATSCR-NEXT: ; def s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; def s[44:45] -; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s33 ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39],s[44:45] +; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill @@ -10931,16 +10897,13 @@ ; GFX10-FLATSCR-NEXT: ; def s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; def s[38:39] -; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; def s44 +; GFX10-FLATSCR-NEXT: ; def s38 ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35],s[38:39] +; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 @@ -11118,15 +11081,14 @@ %sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () %sgpr4 = call <4 x i32> asm sideeffect "; def $0", "=s" () %sgpr5 = call <2 x i32> asm sideeffect "; def $0", "=s" () - %sgpr6 = call <2 x i32> asm sideeffect "; def $0", "=s" () - %sgpr7 = call i32 asm sideeffect "; def 
$0", "=s" () + %sgpr6 = call i32 asm sideeffect "; def $0", "=s" () %cmp = icmp eq i32 %x, 0 br i1 %cmp, label %bb0, label %ret bb0: ; create SGPR pressure - call void asm sideeffect "; use $0,$1,$2,$3,$4,$5,$6", "s,s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, <2 x i32> %sgpr6, i32 %sgpr7) + call void asm sideeffect "; use $0,$1,$2,$3,$4,$5", "s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, i32 %sgpr6) ; mark most VGPR registers as used to increase register pressure call void asm sideeffect "", "~{v4},~{v8},~{v12},~{v16},~{v20},~{v24},~{v28},~{v32}" () diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir @@ -10,9 +10,10 @@ bb.0: liveins: $sgpr50 ; CHECK-LABEL: name: spill_csr_sgpr_argument - ; CHECK: liveins: $sgpr50, $vgpr0 + ; CHECK: liveins: $sgpr50 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr50, 0, $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr50, 0, [[V_WRITELANE_B32_]] ; CHECK-NEXT: S_NOP 0, implicit $sgpr50 ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0 S_NOP 0, implicit $sgpr50 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,59 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; Spill an SGPR to scratch without having spare SGPRs available to save exec 
+; The test was originally written to spill an SGPR to scratch without having spare SGPRs available to save exec. +; This scenario no longer exists when we enabled SGPR spill into virtual VGPRs. define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GFX10-NEXT: s_mov_b32 s10, -1 -; GFX10-NEXT: s_mov_b32 s11, 0x31e16000 -; GFX10-NEXT: s_add_u32 s8, s8, s1 -; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s14, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x31e16000 +; GFX10-NEXT: s_add_u32 s12, s12, s1 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[0:7] ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[8:12] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX10-NOT: s_not_b64 exec, exec +; GFX10-NEXT: ; implicit-def: $vgpr0 ; GFX10-NEXT: v_writelane_b32 v0, s8, 0 ; GFX10-NEXT: v_writelane_b32 v0, s9, 1 ; GFX10-NEXT: v_writelane_b32 v0, s10, 2 ; GFX10-NEXT: v_writelane_b32 v0, s11, 3 ; GFX10-NEXT: v_writelane_b32 v0, s12, 4 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GFX10-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec +; GFX10-NEXT: s_mov_b64 exec, s[14:15] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; 
GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:7] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b64 s[6:7], exec -; GFX10-NEXT: s_mov_b64 exec, 31 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_or_saveexec_b64 s[14:15], -1 +; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b64 exec, s[14:15] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_readlane_b32 s0, v0, 0 ; GFX10-NEXT: v_readlane_b32 s1, v0, 1 ; GFX10-NEXT: v_readlane_b32 s2, v0, 2 ; GFX10-NEXT: v_readlane_b32 s3, v0, 3 ; GFX10-NEXT: v_readlane_b32 s4, v0, 4 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 exec, s[6:7] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:4] ; GFX10-NEXT: ;;#ASMEND @@ -67,4 +58,4 @@ } attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -0,0 +1,320 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction. 
+--- +name: sgpr32_spill +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr10 + ; GCN-LABEL: name: sgpr32_spill + ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 + S_NOP 0 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31 +... + +# An additional virtual lane register is needed because the lanes of the current register are fully occupied while spilling a wide SGPR tuple. +# There must be two implicit defs, one for each of the two lane VGPRs.
+ +--- +name: sgpr_spill_lane_crossover +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } + - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + bb.0: + liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-LABEL: name: sgpr_spill_lane_crossover + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr64, 0, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr65, 1, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr66, 2, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr67, 3, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr68, 4, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed 
$sgpr69, 5, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr70, 6, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr71, 7, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr72, 8, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr73, 9, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr74, 10, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr75, 11, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr76, 12, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr77, 13, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr78, 14, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr79, 15, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr80, 16, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr81, 17, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr82, 18, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr83, 19, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr84, 20, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr85, 21, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr86, 22, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr87, 23, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr88, 24, 
[[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr89, 25, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr90, 26, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr91, 27, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr92, 28, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr93, 29, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr94, 30, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 31, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 32, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: [[V_WRITELANE_B32_2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 33, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 34, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 35, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 36, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 37, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 38, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 39, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 40, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 41, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 42, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 43, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 44, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 45, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 46, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 47, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 48, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 49, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 50, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 51, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 52, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 53, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 54, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 55, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 56, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 57, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 58, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 59, [[V_WRITELANE_B32_1]], implicit 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 60, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 61, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 62, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 63, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: [[V_WRITELANE_B32_2]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 0, [[V_WRITELANE_B32_2]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 33, implicit-def 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 34 + ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 35 + ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 36 + ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 37 + ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 38 + ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 39 + ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 40 + ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 41 + ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 42 + ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 43 + ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 44 + ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 45 + ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 46 + ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 47 + ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 48 + ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 49 + ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 50 + ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 51 + ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 52 + ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 53 + ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 54 + ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 55 + ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 56 + ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 57 + ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 58 + ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 59 + ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 60 + ; GCN-NEXT: 
$sgpr92 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 61 + ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 62 + ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 63 + ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_2]], 0 + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 + S_NOP 0 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_NOP 0 + renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31 +... + +# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here). 
+ +--- +name: lane_vgpr_implicit_def_at_common_dominator_block +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 + ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]] + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 + bb.0: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + S_NOP 0 + S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + bb.1: + liveins: $sgpr10, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 10 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_BRANCH %bb.3 + bb.2: + liveins: $sgpr10, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 20 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 +... + +# The common dominator block is visited only at the end. The insertion point was initially identified as the +# terminator instruction in the dominator block, which later becomes the point where a spill gets inserted in the same block. + +--- +name: dominator_block_follows_the_successors_bbs +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true +body: | + ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 + ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 + ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT:
successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] + ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 + bb.0: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + S_NOP 0 + S_BRANCH %bb.3 + bb.1: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc + S_BRANCH %bb.2 + bb.2: + liveins: $sgpr10, $sgpr30_sgpr31 + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc + S_BRANCH %bb.3 + bb.3: + liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 + $sgpr10 = S_MOV_B32 10 + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 + S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + S_BRANCH %bb.1 + bb.4: + liveins: $sgpr10, $sgpr30_sgpr31 + S_NOP 0 + S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s + +; Regression test for `processFunctionBeforeFrameFinalized`: +; Check that it correctly updates RegisterScavenger so we +; don't end up with bad machine code due to using undefined +; physical registers. + +define void @test() { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %bb.0 +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: .LBB0_1: ; %bb.1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 +; CHECK-NEXT: ; %bb.2: ; %bb.2 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: .LBB0_3: ; %bb.3 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: ; implicit-def: $sgpr4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_readfirstlane_b32 s6, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], -1 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: s_cmp_eq_u32 s6, s7 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: s_mov_b64 s[10:11], exec +; CHECK-NEXT: s_mov_b64 exec, -1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 +; CHECK-NEXT: ; %bb.4: ; %bb.4 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; 
CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_writelane_b32 v0, s4, 0 +; CHECK-NEXT: v_writelane_b32 v0, s5, 1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: .LBB0_5: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse +; CHECK-NEXT: s_mov_b64 exec, s[10:11] +; CHECK-NEXT: v_readlane_b32 s4, v0, 0 +; CHECK-NEXT: v_readlane_b32 s5, v0, 1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: ; implicit-def: $sgpr5 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 +; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 +; CHECK-NEXT: ; %bb.6: ; %bb.5 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb.0: + br label %bb.1 +bb.1: ; preds = %bb.4, %bb.0 + br i1 poison, label %bb.2, label %bb.3 +bb.2: ; preds = %bb.1 + br label %bb.3 +bb.3: ; preds = %bb.2, %bb.1 + %call = tail call i32 @llvm.amdgcn.readfirstlane(i32 poison) + %cmp = icmp eq i32 %call, 0 + br i1 %cmp, label %bb.5, label %bb.4 +bb.4: ; preds = %bb.3 + br label %bb.1 +bb.5: ; preds = %bb.3 + ret void +} + +declare i32 @llvm.amdgcn.readfirstlane(i32) diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -10,16 +10,17 @@ ; GCN-LABEL: sgpr_spill_writelane: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_writelane_b32 v0, s35, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_readlane_b32 s35, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35}"() diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -32,32 +32,29 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr192 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def 
$sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: 
$sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 bb.0: S_NOP 0, implicit-def %0:sgpr_192 diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -30,34 +30,31 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr224 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: 
[[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) - ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: liveins: $vgpr0 - ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 + ; EXPANDED-NEXT: 
$sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 bb.0: S_NOP 0, implicit-def %0:sgpr_224 diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll @@ -22,8 +22,9 @@ ; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 ; GCN-NEXT: v_writelane_b32 v1, s30, 1 ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_mov_b32 s4, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -10,18 +10,20 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b32 s16, s33 ; GFX90A-NEXT: s_mov_b32 s33, s32 -; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: s_mov_b64 exec, -1 +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: s_addk_i32 s32, 0x400 -; GFX90A-NEXT: v_writelane_b32 v41, s16, 0 +; GFX90A-NEXT: v_writelane_b32 v40, s16, 0 ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, 
wobble@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12 ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 -; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 +; GFX90A-NEXT: ; implicit-def: $vgpr0 +; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 +; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] bb: @@ -35,12 +37,12 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[36:37] +; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[36:37] ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 @@ -48,11 +50,11 @@ ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[42:43] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 0 ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 @@ -65,33 
+67,34 @@ ; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS1-NEXT: s_xor_b64 s[86:87], s[4:5], -1 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr40 ; GLOBALNESS1-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 ; GLOBALNESS1-NEXT: s_mov_b32 s98, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[62:63], s[8:9] ; GLOBALNESS1-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS1-NEXT: s_mov_b32 s100, s14 +; GLOBALNESS1-NEXT: s_mov_b32 s56, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS1-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[36:37], 1, v1 ; GLOBALNESS1-NEXT: s_mov_b32 s69, 0x3ff00000 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 1 +; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 1 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 2 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 3 +; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 2 +; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 3 ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4 +; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 4 ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5 +; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 5 ; GLOBALNESS1-NEXT: s_branch .LBB1_4 ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; 
GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v40, 4 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v40, 5 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 @@ -141,19 +144,19 @@ ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] ; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GLOBALNESS1-NEXT: flat_load_dword v45, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_8 @@ -220,23 +223,23 @@ ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0 -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1 +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v40, 0 +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v40, 1 +; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[36:37] +; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: 
; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc -; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43] -; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[36:37], 0, v[0:1] ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 ; GLOBALNESS1-NEXT: s_branch .LBB1_15 ; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow7 @@ -257,15 +260,15 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb3.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[56:57] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[90:91] @@ -278,28 +281,28 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; 
GLOBALNESS1-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], a[32:33], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -314,14 +317,14 @@ ; GLOBALNESS1-NEXT: s_branch .LBB1_3 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b32 s36, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s37, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s38, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[40:41] +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b32 s40, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s41, s93 +; GLOBALNESS1-NEXT: 
s_mov_b64 s[6:7], s[42:43] ; GLOBALNESS1-NEXT: s_mov_b32 s42, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s43, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s44, s93 @@ -350,10 +353,10 @@ ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[6:7] -; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5] +; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[6:7] +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[72:73] +; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[72:73] ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[70:71] @@ -361,21 +364,21 @@ ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2 -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3 +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v40, 2 +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v40, 3 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; 
GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -387,10 +390,10 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -405,10 +408,10 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -420,12 +423,12 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[36:37] +; GLOBALNESS0-NEXT: global_load_dword v0, v42, 
s[36:37] ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 @@ -433,11 +436,11 @@ ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[42:43] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 0 ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 @@ -450,33 +453,34 @@ ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 ; GLOBALNESS0-NEXT: s_xor_b64 s[86:87], s[4:5], -1 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr40 ; GLOBALNESS0-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 ; GLOBALNESS0-NEXT: s_mov_b32 s98, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[8:9] ; GLOBALNESS0-NEXT: s_mov_b32 s99, s15 -; GLOBALNESS0-NEXT: s_mov_b32 s100, s14 +; GLOBALNESS0-NEXT: s_mov_b32 s56, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] ; GLOBALNESS0-NEXT: s_mov_b64 s[92:93], 0x80 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[36:37], 1, v1 ; GLOBALNESS0-NEXT: s_mov_b32 s69, 0x3ff00000 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 ; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; 
GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 1 +; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 1 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 2 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 3 +; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 2 +; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 3 ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4 +; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 4 ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5 +; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 5 ; GLOBALNESS0-NEXT: s_branch .LBB1_4 ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4 -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v40, 4 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v40, 5 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 @@ -526,19 +530,19 @@ ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] ; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 +; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GLOBALNESS0-NEXT: flat_load_dword v45, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: 
v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_8 @@ -605,23 +609,23 @@ ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0 -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1 +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v40, 0 +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v40, 1 +; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[36:37] +; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc -; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43] -; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[36:37], 0, v[0:1] ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 ; GLOBALNESS0-NEXT: s_branch .LBB1_15 ; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow7 @@ -642,15 +646,15 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb50.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; 
GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb3.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[90:91] @@ -663,28 +667,28 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], a[32:33], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 
Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -699,14 +703,14 @@ ; GLOBALNESS0-NEXT: s_branch .LBB1_3 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b32 s36, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s37, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s38, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s39, s93 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[40:41] +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b32 s40, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s41, s93 +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[42:43] ; GLOBALNESS0-NEXT: s_mov_b32 s42, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s43, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s44, s93 @@ -735,10 +739,10 @@ ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[6:7] -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] +; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[6:7] +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_mov_b32 s39, s75 -; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[72:73] +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[72:73] ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[70:71] @@ -746,21 +750,21 @@ ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2 -; 
GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3 +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v40, 2 +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v40, 3 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -772,10 +776,10 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -790,10 +794,10 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] -; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 +; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; 
GLOBALNESS0-NEXT: s_mov_b32 s14, s98 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -14,6 +14,7 @@ ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: ; implicit-def: $vgpr40 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -189,36 +190,38 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, -1 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v46, s16, 0 +; GCN-NEXT: v_writelane_b32 v45, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; 
GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-NEXT: v_writelane_b32 v0, s34, 2 +; GCN-NEXT: v_writelane_b32 v0, s35, 3 +; GCN-NEXT: v_writelane_b32 v0, s36, 4 +; GCN-NEXT: v_writelane_b32 v0, s37, 5 +; GCN-NEXT: v_writelane_b32 v0, s38, 6 +; GCN-NEXT: v_writelane_b32 v0, s39, 7 +; GCN-NEXT: v_writelane_b32 v0, s40, 8 +; GCN-NEXT: v_writelane_b32 v0, s41, 9 +; GCN-NEXT: v_writelane_b32 v0, s42, 10 +; GCN-NEXT: v_writelane_b32 v0, s43, 11 +; GCN-NEXT: v_writelane_b32 v0, s44, 12 +; GCN-NEXT: v_writelane_b32 v0, s45, 13 +; GCN-NEXT: v_writelane_b32 v0, s46, 14 +; GCN-NEXT: v_writelane_b32 v0, s47, 15 +; GCN-NEXT: v_writelane_b32 v0, s48, 16 +; GCN-NEXT: v_writelane_b32 v0, s49, 17 +; GCN-NEXT: v_mov_b32_e32 v40, v31 ; 
GCN-NEXT: s_mov_b32 s44, s15 ; GCN-NEXT: s_mov_b32 s45, s14 ; GCN-NEXT: s_mov_b32 s46, s13 @@ -230,24 +233,24 @@ ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 -; GCN-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NEXT: flat_load_dword v44, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v43, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_getpc_b64 s[48:49] ; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v44 +; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v43 ; GCN-NEXT: s_branch .LBB1_3 ; GCN-NEXT: .LBB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: .LBB1_2: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: .LBB1_3: ; %bb2 ; GCN-NEXT: ; =>This Loop Header: Depth=1 @@ -256,8 +259,8 @@ ; GCN-NEXT: .LBB1_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[42:43] -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: flat_load_dword v0, v[41:42] +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc @@ -289,7 +292,7 @@ ; GCN-NEXT: s_mov_b32 s13, s46 ; GCN-NEXT: s_mov_b32 s14, s45 ; GCN-NEXT: s_mov_b32 s15, s44 -; GCN-NEXT: v_mov_b32_e32 v31, v41 +; 
GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 @@ -304,10 +307,10 @@ ; GCN-NEXT: ; %bb.9: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: .LBB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: s_branch .LBB1_2 bb: %tmp = load float, float* null, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -14,7 +14,7 @@ ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 @@ -22,10 +22,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v34, v14 ; GFX9-NEXT: v_mov_b32_e32 v33, v13 ; GFX9-NEXT: v_mov_b32_e32 v32, v12 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, 
off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART @@ -34,30 +34,31 @@ ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v45, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v41 -; GFX9-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-NEXT: v_mov_b32_e32 v3, v44 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mov_b32_e32 v1, v41 +; GFX9-NEXT: v_mov_b32_e32 v2, v42 +; GFX9-NEXT: v_mov_b32_e32 v3, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 ; GFX9-NEXT: v_readlane_b32 s4, v45, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 @@ -72,7 +73,7 @@ ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 @@ -81,10 +82,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART @@ -93,32 +94,33 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; 
GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v45, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr44 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v44, s30, 0 +; GFX10-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v41 -; GFX10-NEXT: v_mov_b32_e32 v1, v42 -; GFX10-NEXT: v_mov_b32_e32 v2, v43 -; GFX10-NEXT: v_mov_b32_e32 v3, v44 +; GFX10-NEXT: v_mov_b32_e32 v0, v40 +; GFX10-NEXT: v_mov_b32_e32 v1, v41 +; GFX10-NEXT: v_mov_b32_e32 v2, v42 +; GFX10-NEXT: v_mov_b32_e32 v3, v43 ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-NEXT: v_readlane_b32 s30, v44, 0 ; GFX10-NEXT: v_readlane_b32 s4, v45, 0 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 
GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 @@ -135,17 +137,17 @@ ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART @@ -154,30 +156,31 @@ ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v45, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr44 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v44, s30, 0 +; GFX11-NEXT: v_writelane_b32 v44, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42 -; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44 +; GFX11-NEXT: v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41 +; GFX11-NEXT: v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-NEXT: v_readlane_b32 s30, v44, 0 ; GFX11-NEXT: v_readlane_b32 s0, v45, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 @@ -215,44 +218,45 @@ ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v45, v16 -; GFX9-NEXT: v_mov_b32_e32 v44, v15 -; GFX9-NEXT: v_mov_b32_e32 v43, v14 -; GFX9-NEXT: v_mov_b32_e32 v42, v13 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v46, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: v_writelane_b32 v45, s30, 0 +; GFX9-NEXT: v_writelane_b32 v45, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v45, 1 +; GFX9-NEXT: v_readlane_b32 s30, v45, 0 ; GFX9-NEXT: v_readlane_b32 s4, v46, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 @@ -267,46 +271,47 @@ ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 
; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 ; GFX10-NEXT: v_writelane_b32 v46, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: ; implicit-def: $vgpr45 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v41, v16 -; GFX10-NEXT: v_mov_b32_e32 v42, v15 -; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 -; GFX10-NEXT: v_mov_b32_e32 v45, v12 +; GFX10-NEXT: v_writelane_b32 v45, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: v_writelane_b32 v45, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: 
s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: v_readlane_b32 s31, v45, 1 +; GFX10-NEXT: v_readlane_b32 s30, v45, 0 ; GFX10-NEXT: v_readlane_b32 s4, v46, 0 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 @@ -323,44 +328,45 @@ ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: 
scratch_store_b32 off, v43, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 ; GFX11-NEXT: v_writelane_b32 v46, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 -; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 -; GFX11-NEXT: v_mov_b32_e32 v45, v12 +; GFX11-NEXT: v_writelane_b32 v45, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13 +; GFX11-NEXT: v_mov_b32_e32 v44, v12 +; GFX11-NEXT: v_writelane_b32 v45, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: v_readlane_b32 s31, v45, 1 +; GFX11-NEXT: v_readlane_b32 s30, v45, 0 ; GFX11-NEXT: v_readlane_b32 s0, v46, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll @@ -0,0 +1,166 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=virtregrewriter,1 --verify-machineinstrs -o - %s | FileCheck -check-prefix=WWM-SPILL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -stop-after=regallocfast,1 --verify-machineinstrs -o - %s | FileCheck -check-prefix=WWM-SPILL-O0 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s + +; Test whole-wave register spilling. + +; In the testcase, the return address registers (SGPR30_SGPR31) should be preserved across the call. +; Since the test limits the VGPR numbers, they are all in the call-clobber (scratch) range and RA should +; spill any VGPR borrowed for spilling SGPRs. The writelane/readlane instructions that spill/restore +; SGPRs into/from VGPR are whole-wave operations and hence the VGPRs involved in such operations require +; whole-wave spilling. 
+ +define void @test() #0 { +; WWM-SPILL-LABEL: name: test +; WWM-SPILL: bb.0 (%ir-block.0): +; WWM-SPILL-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 +; WWM-SPILL-NEXT: {{ $}} +; WWM-SPILL-NEXT: renamable $vgpr0 = IMPLICIT_DEF +; WWM-SPILL-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr0 +; WWM-SPILL-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr0 +; WWM-SPILL-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) +; WWM-SPILL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 +; WWM-SPILL-NEXT: renamable $sgpr16_sgpr17 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @ext_func + 4, target-flags(amdgpu-gotprel32-hi) @ext_func + 12, implicit-def dead $scc +; WWM-SPILL-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) +; WWM-SPILL-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @ext_func, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 +; WWM-SPILL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 +; WWM-SPILL-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) +; WWM-SPILL-NEXT: $sgpr31 = V_READLANE_B32 $vgpr0, 1 +; WWM-SPILL-NEXT: $sgpr30 = V_READLANE_B32 killed $vgpr0, 0 +; WWM-SPILL-NEXT: SI_RETURN +; +; WWM-SPILL-O0-LABEL: name: test +; WWM-SPILL-O0: bb.0 (%ir-block.0): +; WWM-SPILL-O0-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, 
$sgpr8_sgpr9, $sgpr10_sgpr11 +; WWM-SPILL-O0-NEXT: {{ $}} +; WWM-SPILL-O0-NEXT: renamable $vgpr0 = IMPLICIT_DEF +; WWM-SPILL-O0-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr0 +; WWM-SPILL-O0-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr0 +; WWM-SPILL-O0-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) +; WWM-SPILL-O0-NEXT: renamable $vgpr0 = COPY $vgpr31 +; WWM-SPILL-O0-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 +; WWM-SPILL-O0-NEXT: renamable $sgpr16_sgpr17 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @ext_func + 4, target-flags(amdgpu-gotprel32-hi) @ext_func + 12, implicit-def dead $scc +; WWM-SPILL-O0-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) +; WWM-SPILL-O0-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 +; WWM-SPILL-O0-NEXT: $vgpr31 = COPY killed renamable $vgpr0 +; WWM-SPILL-O0-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed renamable $sgpr20_sgpr21_sgpr22_sgpr23 +; WWM-SPILL-O0-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @ext_func, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 +; WWM-SPILL-O0-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) +; WWM-SPILL-O0-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 +; WWM-SPILL-O0-NEXT: dead $sgpr31 = V_READLANE_B32 $vgpr0, 1 +; WWM-SPILL-O0-NEXT: dead $sgpr30 = V_READLANE_B32 killed $vgpr0, 0 +; WWM-SPILL-O0-NEXT: SI_RETURN +; +; GCN-LABEL: test: 
+; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: s_addk_i32 s32, 0x800 +; GCN-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v0, 1 +; GCN-NEXT: v_readlane_b32 s30, v0, 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s35, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; 
GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_addk_i32 s32, 0xf800 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GCN-O0-LABEL: test: +; GCN-O0: ; %bb.0: +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-O0-NEXT: s_mov_b32 s16, s33 +; GCN-O0-NEXT: s_mov_b32 s33, s32 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] +; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v1, s35 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v1, s16 +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_add_i32 s32, s32, 0x800 +; GCN-O0-NEXT: ; implicit-def: $vgpr0 +; GCN-O0-NEXT: v_writelane_b32 v0, s30, 0 +; GCN-O0-NEXT: v_writelane_b32 v0, s31, 1 +; GCN-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_mov_b64 exec, s[34:35] +; GCN-O0-NEXT: v_mov_b32_e32 v0, v31 +; GCN-O0-NEXT: s_getpc_b64 s[16:17] +; GCN-O0-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 +; GCN-O0-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 +; GCN-O0-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-O0-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] +; GCN-O0-NEXT: v_mov_b32_e32 v31, v0 +; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] +; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], 
s33 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[34:35] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readlane_b32 s31, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s30, v0, 0 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s35, v0 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 +; GCN-O0-NEXT: s_mov_b32 s33, s4 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: s_setpc_b64 s[30:31] + call void @ext_func() + ret void +} + +declare void @ext_func(); + +attributes #0 = { nounwind "amdgpu-num-vgpr"="4" } diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -133,10 +133,12 @@ ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -144,16 +146,17 @@ ; GFX9-O0-NEXT: s_mov_b32 s39, s7 ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] -; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -166,22 +169,25 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[44:45], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[44:45] ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -194,20 +200,23 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Reload +; GFX9-O0-NEXT: s_or_saveexec_b64 s[44:45], -1 ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 -; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 +; GFX9-O0-NEXT: s_mov_b64 exec, s[44:45] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 +; GFX9-O0-NEXT: v_readlane_b32 s34, v4, 4 +; GFX9-O0-NEXT: v_readlane_b32 s35, v4, 5 +; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-O0-NEXT: v_readlane_b32 s36, v4, 0 +; GFX9-O0-NEXT: v_readlane_b32 s37, v4, 1 +; GFX9-O0-NEXT: v_readlane_b32 s38, v4, 2 +; GFX9-O0-NEXT: v_readlane_b32 s39, v4, 3 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] ; GFX9-O0-NEXT: s_mov_b32 s34, 1 ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 @@ -216,9 +225,10 @@ ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -335,13 +345,17 @@ ; GFX9-O0-NEXT: 
s_mov_b32 s35, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x800 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -364,18 +378,22 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: 
s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 ; GFX9-O0-NEXT: s_mov_b32 s33, s35 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -390,8 +408,9 @@ ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O3-NEXT: ; implicit-def: $vgpr3 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 +; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O3-NEXT: s_not_b64 exec, exec @@ -516,37 +535,39 @@ ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s42, s33 +; GFX9-O0-NEXT: s_mov_b32 s44, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v6, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 -; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 +; GFX9-O0-NEXT: ; implicit-def: $vgpr0 +; GFX9-O0-NEXT: 
v_writelane_b32 v0, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: v_writelane_b32 v10, s36, 2 -; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 -; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 -; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 +; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 2 +; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 3 +; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 4 +; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 5 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35 @@ -558,8 +579,11 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 -; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 +; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 6 +; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 7 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -576,13 +600,20 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 -; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 -; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 7 +; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -592,36 +623,38 @@ ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload 
; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: s_mov_b32 s33, s42 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 +; GFX9-O0-NEXT: s_mov_b32 s33, s44 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -641,8 +674,9 @@ ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: 
s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 +; GFX9-O3-NEXT: ; implicit-def: $vgpr8 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 +; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -37,6 +37,7 @@ ; AFTER-PEI-NEXT: occupancy: 5 ; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' +; AFTER-PEI-NEXT: sgprForEXECCopy: '' ; AFTER-PEI-NEXT: body: define amdgpu_kernel void @scavenge_fi(i32 addrspace(1)* %out, i32 %in) #0 { %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -46,6 +46,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -144,6 +145,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -213,6 +215,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # SIMPLE: machineFunctionInfo: @@ -283,6 +286,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 10 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: body: # 
SIMPLE: machineFunctionInfo: @@ -529,3 +533,28 @@ SI_RETURN ... + +--- +# ALL-LABEL: name: sgpr_for_exec_copy +# ALL: sgprForEXECCopy: '$sgpr2_sgpr3' +name: sgpr_for_exec_copy +machineFunctionInfo: + sgprForEXECCopy: '$sgpr2_sgpr3' +body: | + bb.0: + SI_RETURN + +... + +--- +# ALL-LABEL: name: sgpr_for_exec_copy_noreg +# FULL: sgprForEXECCopy: '' +# SIMPLE-NOT: sgprForEXECCopy +name: sgpr_for_exec_copy_noreg +machineFunctionInfo: + sgprForEXECCopy: '$noreg' +body: | + bb.0: + SI_RETURN + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -40,6 +40,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %arg0 @@ -80,6 +81,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { %gep = getelementptr inbounds [128 x i32], [128 x i32] addrspace(2)* @gds, i32 0, i32 %arg0 @@ -134,6 +136,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define void @function() { ret void @@ -180,6 +183,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: body: define void @function_nsz() #0 { ret void diff --git a/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir 
b/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir @@ -0,0 +1,12 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -verify-machineinstrs %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s + +--- +name: invalid_reg +machineFunctionInfo: +# ERR: [[@LINE+1]]:21: unknown register name 'srst' + sgprForEXECCopy: '$srst' +body: | + bb.0: + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir @@ -3,7 +3,7 @@ # contains not dead objects only. So using objects IDs as offset in the storage # caused out of bounds access. -# RUN: llc -march=amdgcn -run-pass=si-lower-sgpr-spills,prologepilog -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s # CHECK-LABEL: name: foo # CHECK: {{^}}fixedStack: []