diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -57,7 +57,7 @@ virtual ~Delegate() = default; virtual void MRI_NoteNewVirtualRegister(Register Reg) = 0; - virtual void MRI_NotecloneVirtualRegister(Register NewReg, + virtual void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) { MRI_NoteNewVirtualRegister(NewReg); } @@ -181,7 +181,7 @@ void noteCloneVirtualRegister(Register NewReg, Register SrcReg) { for (auto *TheDelegate : TheDelegates) - TheDelegate->MRI_NotecloneVirtualRegister(NewReg, SrcReg); + TheDelegate->MRI_NoteCloneVirtualRegister(NewReg, SrcReg); } //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -92,6 +92,8 @@ return true; } + void registerMachineRegisterInfoCallback(MachineFunction &MF) const override; + MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1427,6 +1427,12 @@ return new GCNPassConfig(*this, PM); } +void GCNTargetMachine::registerMachineRegisterInfoCallback( + MachineFunction &MF) const { + SIMachineFunctionInfo *MFI = MF.getInfo(); + MF.getRegInfo().addDelegate(MFI); +} + MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo( BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const { @@ -1481,6 +1487,9 @@ if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, 
MFI->VGPRForAGPRCopy)) return true; + if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy)) + return true; + if (parseOptionalRegister(YamlMFI.LongBranchReservedReg, MFI->LongBranchReservedReg)) return true; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -926,6 +926,17 @@ }; } // namespace ImplicitArg + +namespace VirtRegFlag { +// Virtual register flags used for various target specific handlings during +// codegen. +enum Register_Flag : uint8_t { + // Register operand in a whole-wave mode operation. + WWM_REG = 1 << 0, +}; + +} // namespace VirtRegFlag + } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -34,8 +34,8 @@ RegScavenger *RS = nullptr) const override; void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const; - void determinePrologEpilogSGPRSaves(MachineFunction &MF, - BitVector &SavedRegs) const; + void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, + bool NeedExecCopyReservedReg) const; void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, Register FrameReg, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -64,9 +64,12 @@ return MCRegister(); } +/// Query target location for spilling SGPRs +/// \p IncludeScratchCopy : Also look for free scratch SGPRs static void getVGPRSpillLaneOrTempRegister( MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, - const TargetRegisterClass &RC = 
AMDGPU::SReg_32_XM0_XEXECRegClass) { + const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, + bool IncludeScratchCopy = true) { SIMachineFunctionInfo *MFI = MF.getInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); @@ -77,9 +80,12 @@ // We need to save and restore the given SGPR. + Register ScratchSGPR; // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs - // should have all the callee saved registers marked as used. - Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); + // should have all the callee saved registers marked as used. For certain + // cases we skip copy to scratch SGPR. + if (IncludeScratchCopy) + ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); if (!ScratchSGPR) { int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, @@ -935,8 +941,7 @@ if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ true, @@ -948,8 +953,7 @@ if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) .addReg(ScratchExecCopy, RegState::Kill); LiveRegs.addReg(ScratchExecCopy); } @@ -1040,8 +1044,7 @@ if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ false, @@ -1053,8 +1056,7 @@ if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) .addReg(ScratchExecCopy, RegState::Kill); } } @@ -1463,8 +1465,10 @@ // The special SGPR spills like the one needed for FP, BP or any reserved // registers delayed until frame lowering. void SIFrameLowering::determinePrologEpilogSGPRSaves( - MachineFunction &MF, BitVector &SavedVGPRs) const { + MachineFunction &MF, BitVector &SavedVGPRs, + bool NeedExecCopyReservedReg) const { MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1476,6 +1480,26 @@ for (unsigned I = 0; CSRegs[I]; ++I) LiveRegs.addReg(CSRegs[I]); + const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); + + if (NeedExecCopyReservedReg) { + Register ReservedReg = MFI->getSGPRForEXECCopy(); + assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); + Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); + if (UnusedScratchReg) { + // If found any unused scratch SGPR, reserve the register itself for Exec + // copy and there is no need for any spills in that case. + MFI->setSGPRForEXECCopy(UnusedScratchReg); + LiveRegs.addReg(UnusedScratchReg); + } else { + // Needs spill. 
+ assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && + "Re-reserving spill slot for EXEC copy register"); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, + /*IncludeScratchCopy=*/false); + } + } + // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. @@ -1514,6 +1538,8 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool NeedExecCopyReservedReg = false; MachineInstr *ReturnMI = nullptr; for (MachineBasicBlock &MBB : MF) { @@ -1532,6 +1558,8 @@ MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); + else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) + NeedExecCopyReservedReg = true; else if (MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { // We expect all return to be the same size. @@ -1561,7 +1589,7 @@ if (!ST.hasGFX90AInsts()) SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); - determinePrologEpilogSGPRSaves(MF, SavedVGPRs); + determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't // allow the default insertion to handle them. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13395,6 +13395,15 @@ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); } + // TODO: Move this logic to getReservedRegs() + // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. + unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); + Register SReg = ST.isWave32() + ? 
AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) + : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, + &AMDGPU::SGPR_64RegClass); + Info->setSGPRForEXECCopy(SReg); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), Info->getStackPtrOffsetReg())); if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -654,6 +654,11 @@ return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; } + static bool isWWMRegSpillOpcode(uint16_t Opcode) { + return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || + Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE; + } + static bool isDPP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DPP; } @@ -939,6 +944,15 @@ unsigned getVALUOp(const MachineInstr &MI) const; + void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + bool IsSCCLive) const; + + void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + Register Reg) const; + /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined /// in tablegen. For generic instructions, like REG_SEQUENCE it will return diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1587,6 +1587,30 @@ } } +static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { + // Currently, there is only 32-bit WWM register spills needed. 
+ if (Size != 4) + llvm_unreachable("unknown wwm register spill size"); + + return AMDGPU::SI_SPILL_WWM_V32_SAVE; +} + +static unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &MFI) { + // Choose the right opcode if spilling a WWM register. + if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + return getWWMRegSpillSaveOpcode(Size); + + if (TRI.isVectorSuperClass(RC)) + return getAVSpillSaveOpcode(Size); + + return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) + : getVGPRSpillSaveOpcode(Size); +} + void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -1631,11 +1655,8 @@ return; } - unsigned Opcode = RI.isVectorSuperClass(RC) - ? getAVSpillSaveOpcode(SpillSize) - : RI.isAGPRClass(RC) - ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, + SpillSize, RI, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1786,6 +1807,29 @@ } } +static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { + // Currently, there is only 32-bit WWM register spills needed. + if (Size != 4) + llvm_unreachable("unknown wwm register spill size"); + + return AMDGPU::SI_SPILL_WWM_V32_RESTORE; +} + +static unsigned +getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &MFI) { + // Choose the right opcode if restoring a WWM register. + if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + return getWWMRegSpillRestoreOpcode(Size); + + if (TRI.isVectorSuperClass(RC)) + return getAVSpillRestoreOpcode(Size); + + return TRI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(Size) + : getVGPRSpillRestoreOpcode(Size); +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1829,11 +1873,8 @@ return; } - unsigned Opcode = RI.isVectorSuperClass(RC) - ? getAVSpillRestoreOpcode(SpillSize) - : RI.isAGPRClass(RC) - ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); + unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, + SpillSize, RI, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset @@ -4924,6 +4965,39 @@ "Unexpected scalar opcode without corresponding vector one!"); } +void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + bool IsSCCLive) const { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool IsWave32 = ST.isWave32(); + if (IsSCCLive) { + // Insert two move instructions, one to save the original value of EXEC and + // the other to turn on all bits in EXEC. This is required as we can't use + // the single instruction S_OR_SAVEEXEC that clobbers SCC. + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + } else { + const unsigned OrSaveExec = + IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + auto SaveExec = + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); + SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 
+ } +} + +void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg) const { + unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); +} + static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -933,6 +933,9 @@ defm SI_SPILL_AV512 : SI_SPILL_VGPR ; defm SI_SPILL_AV1024 : SI_SPILL_VGPR ; +let isConvergent = 1 in +defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR ; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -334,7 +334,20 @@ // lane". FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); + const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); + // Shift back the reserved SGPR for EXEC copy into the lowest range. + // This SGPR is reserved to handle the whole-wave spill/copy operations + // that might get inserted during vgpr regalloc. + Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF); + if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < + TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) + FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); + MadeChange = true; + } else { + // No SGPR spills and hence there won't be any WWM spills/copies. Reset the + // SGPR reserved for EXEC copy. 
+ FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -283,6 +283,7 @@ SIMode Mode; std::optional ScavengeFI; StringValue VGPRForAGPRCopy; + StringValue SGPRForEXECCopy; StringValue LongBranchReservedReg; SIMachineFunctionInfo() = default; @@ -327,6 +328,8 @@ YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, StringValue()); // Don't print out when it's empty. + YamlIO.mapOptional("sgprForEXECCopy", MFI.SGPRForEXECCopy, + StringValue()); // Don't print out when it's empty. YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg, StringValue()); } @@ -365,7 +368,8 @@ /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo final : public AMDGPUMachineFunction { +class SIMachineFunctionInfo final : public AMDGPUMachineFunction, + private MachineRegisterInfo::Delegate { friend class GCNTargetMachine; // State of MODE register, assumed FP mode. @@ -468,6 +472,9 @@ unsigned HighBitsOf32BitAddress; + // Flags associated with the virtual registers. + IndexedMap VRegFlags; + // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -477,6 +484,10 @@ MCPhysReg getNextSystemSGPR() const; + // MachineRegisterInfo callback functions to notify events. + void MRI_NoteNewVirtualRegister(Register Reg) override; + void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override; + public: struct VGPRSpillToAGPR { SmallVector Lanes; @@ -519,6 +530,9 @@ // PrologEpilogInserter. PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; + // To save/restore EXEC MASK around WWM spills and copies. 
+ Register SGPRForEXECCopy; + DenseMap VGPRToAGPRSpills; // AGPRs used for VGPR spills. @@ -640,6 +654,19 @@ : ArrayRef(I->second); } + void setFlag(Register Reg, uint8_t Flag) { + assert(Reg.isVirtual()); + if (VRegFlags.inBounds(Reg)) + VRegFlags[Reg] |= Flag; + } + + bool checkFlag(Register Reg, uint8_t Flag) const { + if (Reg.isPhysical()) + return false; + + return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag; + } + void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -652,6 +679,10 @@ return SpillAGPR; } + Register getSGPRForEXECCopy() const { return SGPRForEXECCopy; } + + void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; } + ArrayRef getVGPRSpillAGPRs() const { return SpillVGPR; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -65,6 +65,8 @@ Occupancy = ST.computeOccupancy(F, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); + VRegFlags.reserve(1024); + // FIXME: Should have analysis or something rather than attribute to detect // calls. 
const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); @@ -540,6 +542,16 @@ return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } +void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) { + VRegFlags.grow(Reg); +} + +void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg, + Register SrcReg) { + VRegFlags.grow(NewReg); + VRegFlags[NewReg] = VRegFlags[SrcReg]; +} + Register SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const { const GCNSubtarget &ST = MF.getSubtarget(); @@ -652,6 +664,10 @@ LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI); if (MFI.getVGPRForAGPRCopy()) VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); + + if (MFI.getSGPRForEXECCopy()) + SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI); + auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -70,6 +70,12 @@ return SpillSGPRToVGPR; } + /// Return the largest available SGPR aligned to \p Align for the register + /// class \p RC. + MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, + const unsigned Align, + const TargetRegisterClass *RC) const; + /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. 
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -533,11 +533,18 @@ return SubRegFromChannelTable[NumRegIndex - 1][Channel]; } +MCRegister +SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF, + const unsigned Align, + const TargetRegisterClass *RC) const { + unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align; + MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC); +} + MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; - MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); + return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -640,6 +647,12 @@ assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } + // FIXME: Use same reserved register introduced in D149775 + // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. + Register ExecCopyReg = MFI->getSGPRForEXECCopy(); + if (ExecCopyReg) + reserveRegisterTuples(Reserved, ExecCopyReg); + // Reserve VGPRs/AGPRs. 
// unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); @@ -1053,6 +1066,8 @@ case AMDGPU::SI_SPILL_A32_RESTORE: case AMDGPU::SI_SPILL_AV32_SAVE: case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_WWM_V32_SAVE: + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -2125,7 +2140,8 @@ case AMDGPU::SI_SPILL_AV128_SAVE: case AMDGPU::SI_SPILL_AV96_SAVE: case AMDGPU::SI_SPILL_AV64_SAVE: - case AMDGPU::SI_SPILL_AV32_SAVE: { + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_WWM_V32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2134,11 +2150,19 @@ unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; auto *MBB = MI->getParent(); + bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); + if (IsWWMRegSpill) { + TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), + RS->isRegUsed(AMDGPU::SCC)); + } buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); + if (IsWWMRegSpill) + TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); + MI->eraseFromParent(); return true; } @@ -2183,7 +2207,8 @@ case AMDGPU::SI_SPILL_AV352_RESTORE: case AMDGPU::SI_SPILL_AV384_RESTORE: case AMDGPU::SI_SPILL_AV512_RESTORE: - case AMDGPU::SI_SPILL_AV1024_RESTORE: { + case AMDGPU::SI_SPILL_AV1024_RESTORE: + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2192,10 +2217,19 @@ unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; auto *MBB = MI->getParent(); + bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); + if (IsWWMRegSpill) { + TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), + RS->isRegUsed(AMDGPU::SCC)); + } buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); + + if (IsWWMRegSpill) + TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); + MI->eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -1371,7 +1371,7 @@ ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1395,14 +1395,14 @@ ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1425,14 +1425,14 @@ ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1455,14 +1455,14 @@ ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1484,14 +1484,14 @@ ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1515,7 +1515,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1528,7 +1528,7 @@ ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1556,14 +1556,14 @@ ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1590,14 +1590,14 @@ ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1619,14 +1619,14 @@ ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; 
GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1648,14 +1648,14 @@ ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1679,7 +1679,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1692,7 +1692,7 @@ ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1721,14 +1721,14 @@ ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ 
-1756,14 +1756,14 @@ ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1789,14 +1789,14 @@ ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1824,14 +1824,14 @@ ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1860,7 +1860,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: 
s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1873,7 +1873,7 @@ ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1909,14 +1909,14 @@ ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1951,14 +1951,14 @@ ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1991,14 +1991,14 @@ ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, 
s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2026,14 +2026,14 @@ ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2063,7 +2063,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2076,7 +2076,7 @@ ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2128,14 +2128,14 @@ ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; 
%entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2186,14 +2186,14 @@ ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2240,14 +2240,14 @@ ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2283,14 +2283,14 @@ ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: 
s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2328,7 +2328,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2341,7 +2341,7 @@ ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2425,14 +2425,14 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2515,14 +2515,14 @@ ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: 
buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2597,14 +2597,14 @@ ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2656,14 +2656,14 @@ ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2717,7 +2717,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -591,8 +591,7 @@ ; VGPR. 
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; FLATSCR: s_mov_b32 s33, s0 -; MUBUF: s_mov_b32 s33, s32 +; GCN: s_mov_b32 s33, s32 ; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], [[FP_SCRATCH_COPY]] @@ -632,14 +631,14 @@ ; Make sure that the FP save happens after restoring exec from the same ; register. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 s0, s33 +; FLATSCR: s_mov_b32 s2, s33 ; FLATSCR: s_mov_b32 s33, s32 ; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 s[2:3], -1 -; FLATSCR: s_mov_b64 exec, s[2:3] -; FLATSCR: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR: s_or_saveexec_b64 s[4:5], -1 +; FLATSCR: s_mov_b64 exec, s[4:5] +; FLATSCR: s_or_saveexec_b64 s[4:5], -1 ; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, s0 +; FLATSCR: s_mov_b32 s33, s2 ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -8,7 +8,7 @@ ; SDAG-LABEL: gfx_func: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s36, s33 +; SDAG-NEXT: s_mov_b32 s38, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -81,14 +81,14 @@ ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: s_mov_b32 s33, s36 +; SDAG-NEXT: s_mov_b32 s33, s38 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; 
GISEL-LABEL: gfx_func: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s36, s33 +; GISEL-NEXT: s_mov_b32 s38, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -161,7 +161,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s36 +; GISEL-NEXT: s_mov_b32 s33, s38 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void @extern_c_func() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -7117,7 +7117,7 @@ ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7202,14 +7202,14 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: tail_call_byval_align16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -7297,14 +7297,14 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: tail_call_byval_align16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s33 +; GFX11-NEXT: s_mov_b32 s5, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7387,14 +7387,14 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: s_mov_b32 s33, s4 +; GFX11-NEXT: s_mov_b32 s33, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SCRATCH-LABEL: tail_call_byval_align16: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -7479,7 +7479,7 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s5 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -183,9 +183,9 @@ ; GFX11-LABEL: void_func_void_clobber_s28_s29: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v0, s28, 0 ; GFX11-NEXT: v_writelane_b32 v0, s29, 1 ; GFX11-NEXT: v_writelane_b32 v0, s30, 2 @@ -201,9 +201,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 ; GFX11-NEXT: v_readlane_b32 s29, v0, 1 ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 @@ -854,54 +854,54 @@ ; GFX9-LABEL: void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s33, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: void_func_void_clobber_s33: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s33, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_void_clobber_s33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v0, s33, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s33, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 @@ -912,54 +912,54 @@ ; GFX9-LABEL: void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s34, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; 
GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: void_func_void_clobber_s34: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s34, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_void_clobber_s34: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v0, s34, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s34, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; 
clobber", "~{s34}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -79,7 +79,7 @@ ; GFX11-LABEL: call_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -100,7 +100,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -184,7 +184,7 @@ ; GFX11-LABEL: call_i16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -205,7 +205,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -289,7 +289,7 @@ ; GFX11-LABEL: call_2xi16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -310,7 +310,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; 
GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -402,7 +402,7 @@ ; GFX11-LABEL: call_3xi16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill @@ -423,7 +423,7 @@ ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -890,7 +890,7 @@ ; GFX11-LABEL: call_100xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v100, s33 offset:128 ; 4-byte Folded Spill @@ -976,7 +976,7 @@ ; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xff70 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -1327,7 +1327,7 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1418,14 +1418,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1516,7 +1516,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1531,7 +1531,7 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1620,14 +1620,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; 
GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1716,7 +1716,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1728,7 +1728,7 @@ ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1814,14 +1814,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1907,7 +1907,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -7,7 +7,7 @@ ; 
GFX11-LABEL: f0: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill @@ -28,7 +28,7 @@ ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -53,20 +53,20 @@ ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s18, s14 ; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s19, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -74,7 +74,7 @@ ; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s25, -1, 0 +; GFX11-NEXT: s_cselect_b32 s24, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_mov_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 @@ -90,41 +90,40 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], 
s[0:1] ; GFX11-NEXT: s_mov_b32 s1, -1 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_vccz .LBB2_4 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-NEXT: s_branch .LBB2_12 ; GFX11-NEXT: .LBB2_3: ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54 +; GFX11-NEXT: s_load_b32 s3, s[16:17], 0x54 ; GFX11-NEXT: s_bitcmp1_b32 s23, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_and_b32 s3, s23, 1 +; GFX11-NEXT: s_and_b32 s9, s23, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_bitcmp1_b32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, -1 ; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28 -; GFX11-NEXT: s_mul_i32 s3, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25 +; GFX11-NEXT: s_mul_hi_u32 s3, s29, s28 +; GFX11-NEXT: s_mul_i32 s9, s29, s28 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s3, s9, 1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, 1 -; GFX11-NEXT: s_lshr_b32 s2, s2, s30 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s2, s2, s22 -; GFX11-NEXT: s_mul_i32 s2, s2, s20 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s24, s2 +; GFX11-NEXT: s_or_b32 s3, s3, 1 +; GFX11-NEXT: s_lshr_b32 s3, s3, s30 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s9, s3, s22 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_mul_i32 s9, s9, s20 +; GFX11-NEXT: s_or_b32 s2, s2, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], 1 ; GFX11-NEXT: global_load_u16 v2, v1, s[22:23] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -152,10 +151,10 @@ ; GFX11-NEXT: s_or_b32 s3, s2, s3 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow -; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: .LBB2_8: ; %Flow12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s3 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: ; GFX11-NEXT: s_xor_b32 s0, s8, -1 @@ -167,11 +166,11 @@ ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s21, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s3, s1, exec_lo +; GFX11-NEXT: s_and_b32 s20, s1, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19 -; GFX11-NEXT: s_and_saveexec_b32 s19, s0 +; GFX11-NEXT: s_and_saveexec_b32 s2, s0 ; GFX11-NEXT: s_cbranch_execz .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 ; GFX11-NEXT: s_add_u32 s8, s16, 0x58 @@ -184,10 +183,10 @@ ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s3, s3, exec_lo +; GFX11-NEXT: s_or_b32 s20, s20, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19 -; GFX11-NEXT: s_and_saveexec_b32 s0, s3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_and_saveexec_b32 s0, s20 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock 
; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3151,18 +3151,18 @@ ; GCN-HSA-NEXT: s_lshr_b32 s33, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s38, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s39, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s40, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s41, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s42, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s43, s14, 16 ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s43, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s38, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff @@ -3284,25 +3284,25 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s42 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s40 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 @@ -3321,7 +3321,7 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3397,10 +3397,10 @@ ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s10, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff @@ -3419,113 +3419,114 @@ ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s31, 0xffff -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s30, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0xf0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s31, 0xffff +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s30, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xf0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0xe0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xe0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0xd0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xd0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0xc0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xc0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0xb0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xb0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0xa0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0xa0 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0x90 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x90 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0x80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x80 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0x70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x70 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0x60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x60 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 0x50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NOHSA-VI-NEXT: s_add_u32 s10, s36, 0x50 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: s_addc_u32 s11, s37, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: s_add_u32 s8, s36, 64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: s_addc_u32 s9, s37, 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6461,129 +6462,129 @@ ; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s35, s0, 0xffff -; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s36, s1, 0xffff -; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff ; 
GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff -; GCN-HSA-NEXT: s_and_b32 s0, s15, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_and_b32 s2, s15, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; 
GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: 
s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 
0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6919,12 +6920,12 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s36, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s14, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 @@ -6934,7 +6935,7 @@ ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 ; GCN-NOHSA-SI-NEXT: 
s_bfe_i64 s[54:55], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 @@ -6966,49 +6967,49 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[42:43], 0x100000 +; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s72 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s73 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 
@@ -7023,32 +7024,32 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 ; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -3190,16 +3190,16 @@ ; GFX8-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s25, s4, 24 ; GFX8-NOHSA-NEXT: s_lshr_b32 s27, s5, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s31, s7, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s33, s8, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s9, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s35, s10, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s11, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s37, s12, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s13, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s39, s14, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s15, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s29, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s33, s7, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s8, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s35, s9, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s10, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s37, s11, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s12, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s39, s13, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s14, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s15, 24 ; GFX8-NOHSA-NEXT: s_and_b32 s20, s0, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0 ; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010 @@ -3212,7 +3212,7 @@ ; GFX8-NOHSA-NEXT: s_and_b32 s26, s3, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3 ; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s40, s4, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s28, s4, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s4 ; GFX8-NOHSA-NEXT: s_bfe_u32 s41, s4, 0x80010 ; GFX8-NOHSA-NEXT: s_and_b32 s4, s5, 0xff @@ -3235,25 +3235,25 @@ ; GFX8-NOHSA-NEXT: s_bfe_u32 s58, s13, 0x80010 ; GFX8-NOHSA-NEXT: 
s_and_b32 s59, s14, 0xff ; GFX8-NOHSA-NEXT: s_bfe_u32 s60, s14, 0x80010 -; GFX8-NOHSA-NEXT: s_and_b32 s29, s15, 0xff +; GFX8-NOHSA-NEXT: s_and_b32 s31, s15, 0xff ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s15 ; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s15, 0x80010 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s28 -; GFX8-NOHSA-NEXT: s_add_u32 s28, s16, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s29 -; GFX8-NOHSA-NEXT: s_addc_u32 s29, s17, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s30 +; GFX8-NOHSA-NEXT: s_add_u32 s30, s16, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s31 +; GFX8-NOHSA-NEXT: s_addc_u32 s31, s17, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s31 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s40 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s28 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s14 ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xe0 ; GFX8-NOHSA-NEXT: s_addc_u32 s15, s17, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s14 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s39 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s40 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s15 ; GFX8-NOHSA-NEXT: s_add_u32 s14, s16, 0xd0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] @@ -3262,7 +3262,7 @@ ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s57 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s39 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s15 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s26 @@ -3274,14 +3274,14 @@ ; GFX8-NOHSA-NEXT: s_add_u32 s12, s16, 0xb0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s55 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s56 -; 
GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s38 ; GFX8-NOHSA-NEXT: s_addc_u32 s13, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s13 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s53 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s36 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s37 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s12 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s24 @@ -3293,14 +3293,14 @@ ; GFX8-NOHSA-NEXT: s_add_u32 s10, s16, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s51 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s36 ; GFX8-NOHSA-NEXT: s_addc_u32 s11, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s11 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s49 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s35 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s10 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 @@ -3312,14 +3312,14 @@ ; GFX8-NOHSA-NEXT: s_add_u32 s8, s16, 0x70 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s47 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s34 ; GFX8-NOHSA-NEXT: s_addc_u32 s9, s17, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s9 ; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v11, 8, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s45 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s33 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s8 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 
v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20 @@ -3329,7 +3329,7 @@ ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v15, s7 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s43 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s29 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v14, s6 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -5913,16 +5913,16 @@ ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s12, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s24, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s14, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s26, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s4, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 @@ -5931,6 +5931,7 @@ ; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[6:7], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 @@ -5940,60 +5941,59 @@ ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 
s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x60 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 
+; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GFX7-HSA-NEXT: s_add_u32 s8, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; 
GFX7-HSA-NEXT: s_nop 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 @@ -6001,8 +6001,8 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6336,23 +6336,23 @@ ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s3, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s5, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s13, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s15, s8, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s9, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s17, s10, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s11, 24 -; GFX7-HSA-NEXT: s_bfe_u32 s19, s11, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s20, s10, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s21, s9, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s22, s8, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s23, s7, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s24, s6, 0x80008 -; GFX7-HSA-NEXT: s_bfe_u32 s25, s5, 0x80008 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s15, s7, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s10, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s19, s11, 24 +; GFX7-HSA-NEXT: s_bfe_u32 s20, s11, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s21, s10, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s22, s9, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s23, s8, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s24, s7, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s25, s6, 0x80008 +; GFX7-HSA-NEXT: s_bfe_u32 s26, s5, 0x80008 ; GFX7-HSA-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX7-HSA-NEXT: s_and_b32 
s26, s4, 0xff +; GFX7-HSA-NEXT: s_and_b32 s3, s4, 0xff ; GFX7-HSA-NEXT: s_and_b32 s27, s5, 0xff ; GFX7-HSA-NEXT: s_and_b32 s28, s6, 0xff ; GFX7-HSA-NEXT: s_and_b32 s29, s7, 0xff @@ -6372,109 +6372,109 @@ ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xd0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xb0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x90 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-HSA-NEXT: 
v_mov_b32_e32 v2, s14 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 48 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 16 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0xa0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x80 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 
; GFX7-HSA-NEXT: s_add_u32 s4, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 64 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: s_add_u32 s4, s0, 32 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -6976,17 +6976,17 @@ ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s40, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s42, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s52, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s54, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s42, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16 +; GFX7-HSA-NEXT: 
s_lshr_b32 s46, s6, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s54, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s4, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s36, s3, 16 ; GFX7-HSA-NEXT: s_lshr_b32 s30, s3, 8 ; GFX7-HSA-NEXT: s_mov_b32 s34, s3 @@ -6994,15 +6994,15 @@ ; GFX7-HSA-NEXT: s_lshr_b32 s26, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 8 ; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s1, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s62, s1, 8 ; GFX7-HSA-NEXT: s_mov_b32 s16, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s62, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s64, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s64, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s68, s0, 8 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i64 s[20:21], s[2:3], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_ashr_i64 s[68:69], s[4:5], 56 +; GFX7-HSA-NEXT: s_ashr_i64 s[38:39], s[4:5], 56 ; GFX7-HSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56 ; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 @@ -7012,11 +7012,11 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], 
s[18:19], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 @@ -7024,6 +7024,7 @@ ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 @@ -7034,24 +7035,23 @@ ; GFX7-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xf0 -; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s39 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xd0 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s39 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s38 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s39 +; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0 +; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s40 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xe0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s41 +; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s41 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xd0 +; GFX7-HSA-NEXT: s_addc_u32 s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s40 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s41 +; GFX7-HSA-NEXT: s_add_u32 s40, s8, 0xc0 +; GFX7-HSA-NEXT: s_addc_u32 
s41, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s38 ; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s39 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 @@ -7060,44 +7060,44 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v32, s38 ; GFX7-HSA-NEXT: v_mov_b32_e32 v33, s39 ; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s43 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s42 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s43 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s44 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s40 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s24 ; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x80 ; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s69 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s51 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s58 ; GFX7-HSA-NEXT: 
v_mov_b32_e32 v5, s25 ; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s38 ; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s59 ; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s39 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s58 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 ; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s25 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s24, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -54,7 +54,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s24 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -113,9 +113,9 @@ ; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: s_xor_saveexec_b64 s[20:21], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_mov_b64 exec, s[20:21] ; CHECK-NEXT: s_setpc_b64 s[16:17] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -172,7 +172,7 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s24, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -194,7 +194,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -206,7 +206,7 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s19, s33 +; CHECK-NEXT: s_mov_b32 s25, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -228,7 +228,7 @@ ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s19 +; CHECK-NEXT: s_mov_b32 s33, s25 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git 
a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,7 +16,7 @@ ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill @@ -269,7 +269,7 @@ ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -310,7 +310,7 @@ ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill @@ -561,7 +561,7 @@ ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -1512,7 +1512,7 @@ ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 
offset:444 ; 4-byte Folded Spill @@ -1782,7 +1782,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -117,16 +117,16 @@ ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Reload +; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -199,16 +199,15 @@ ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b32 s10, 0x40100 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: 
;;#ASMEND ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 -; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: ;;#ASMSTART ; MUBUF-NEXT: ;;#ASMEND @@ -637,5 +636,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } -attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } -attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } +attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10085,14 +10085,24 @@ ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v5, -1, v0 ; GFX6-NEXT: v_mov_b32_e32 v6, 0 -; GFX6-NEXT: s_mov_b32 s38, 0 -; GFX6-NEXT: s_mov_b32 s39, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v5 ; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:240 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b64 s[8:9], exec +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v2, 
off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10101,7 +10111,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:224 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10110,7 +10120,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10119,7 +10129,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10128,7 +10138,7 
@@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10137,7 +10147,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10146,7 +10156,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10155,7 +10165,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10164,7 +10174,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10173,7 +10183,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 ; GFX6-NEXT: s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10182,17 +10192,8 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:80 -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 
offset:64 -; GFX6-NEXT: s_mov_b32 s2, 0x80400 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10200,17 +10201,31 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:16 -; GFX6-NEXT: s_mov_b32 s2, 0x80800 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[36:39], 0 addr64 offset:32 -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[36:39], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_writelane_b32 v4, s0, 0 +; GFX6-NEXT: v_writelane_b32 v4, s1, 1 +; GFX6-NEXT: v_writelane_b32 v4, s2, 2 +; GFX6-NEXT: v_writelane_b32 v4, s3, 3 +; GFX6-NEXT: s_mov_b32 s10, 0x80400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s10 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt 
expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[8:9] +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, 1 @@ -10230,7 +10245,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s12, 0x83800 +; GFX6-NEXT: s_mov_b32 s12, 0x83c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10253,15 +10268,12 @@ ; GFX6-NEXT: ; def s[2:3] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; def s[36:37] -; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ; def s33 ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX6-NEXT: s_cbranch_execz .LBB1_2 ; GFX6-NEXT: ; %bb.1: ; %bb0 -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10273,18 +10285,18 @@ ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2100 -; GFX6-NEXT: buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x84400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v7, 0x20e0 +; GFX6-NEXT: s_mov_b32 s38, 
0x83c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s8, v4, 0 ; GFX6-NEXT: v_readlane_b32 s9, v4, 1 @@ -10296,8 +10308,8 @@ ; GFX6-NEXT: v_readlane_b32 s15, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10309,18 +10321,18 @@ ; GFX6-NEXT: v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2120 -; GFX6-NEXT: buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x84c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2100 +; GFX6-NEXT: s_mov_b32 s38, 0x84400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s16, v4, 0 ; GFX6-NEXT: v_readlane_b32 s17, v4, 1 @@ -10332,8 +10344,8 @@ ; GFX6-NEXT: v_readlane_b32 s23, v4, 7 ; GFX6-NEXT: buffer_load_dword 
v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10345,18 +10357,18 @@ ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2140 -; GFX6-NEXT: buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2120 +; GFX6-NEXT: s_mov_b32 s38, 0x84c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s24, v4, 0 ; GFX6-NEXT: v_readlane_b32 s25, v4, 1 @@ -10368,8 +10380,8 @@ ; GFX6-NEXT: v_readlane_b32 s31, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, s[36:37] +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10377,12 +10389,12 @@ ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: 
v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2160 -; GFX6-NEXT: buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s38, 0x85c00 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[0:1], exec ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10391,8 +10403,8 @@ ; GFX6-NEXT: v_writelane_b32 v4, s5, 1 ; GFX6-NEXT: v_writelane_b32 v4, s6, 2 ; GFX6-NEXT: v_writelane_b32 v4, s7, 3 -; GFX6-NEXT: s_mov_b32 s44, 0x85c00 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s44 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s36, 0x86000 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10403,18 +10415,18 @@ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s2, 0 ; GFX6-NEXT: v_writelane_b32 v4, s3, 1 -; GFX6-NEXT: s_mov_b32 s4, 0x86600 +; GFX6-NEXT: s_mov_b32 s4, 0x86400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[0:1] -; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2140 +; GFX6-NEXT: s_mov_b32 s38, 0x85400 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s0, 
v4, 0 ; GFX6-NEXT: v_readlane_b32 s1, v4, 1 @@ -10426,36 +10438,10 @@ ; GFX6-NEXT: v_readlane_b32 s7, v4, 7 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] +; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s36, 0 -; GFX6-NEXT: v_writelane_b32 v4, s37, 1 -; GFX6-NEXT: v_writelane_b32 v4, s38, 2 -; GFX6-NEXT: v_writelane_b32 v4, s39, 3 ; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 -; GFX6-NEXT: buffer_store_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_mov_b64 s[38:39], exec -; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_writelane_b32 v4, s36, 0 -; GFX6-NEXT: v_writelane_b32 v4, s37, 1 -; GFX6-NEXT: s_mov_b32 s44, 0x86400 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s44 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[38:39] -; GFX6-NEXT: s_mov_b64 s[44:45], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2170 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload @@ -10467,58 +10453,26 @@ ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[44:45] -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: 
buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s44, v4, 0 -; GFX6-NEXT: v_readlane_b32 s45, v4, 1 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_not_b64 exec, exec ; GFX6-NEXT: s_mov_b64 vcc, s[34:35] -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2198 +; GFX6-NEXT: s_mov_b64 s[44:45], exec +; GFX6-NEXT: s_mov_b64 exec, 3 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2198 -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload -; GFX6-NEXT: s_not_b64 exec, exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readlane_b32 s34, v4, 0 ; GFX6-NEXT: v_readlane_b32 s35, v4, 1 ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_not_b64 exec, exec +; GFX6-NEXT: s_mov_b64 exec, s[44:45] ; GFX6-NEXT: ;;#ASMSTART -; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35],s[44:45] +; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: s_mov_b64 s[34:35], vcc -; GFX6-NEXT: s_mov_b64 s[0:1], exec -; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s2, 0x86000 -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_readlane_b32 s36, v4, 0 -; GFX6-NEXT: v_readlane_b32 s37, v4, 1 -; GFX6-NEXT: v_readlane_b32 s38, v4, 2 -; 
GFX6-NEXT: v_readlane_b32 s39, v4, 3 -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x85800 +; GFX6-NEXT: s_mov_b32 s6, 0x85c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload @@ -10530,19 +10484,19 @@ ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 s2, 0x83c00 ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: s_mov_b32 s2, 0x84400 ; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: s_mov_b32 s2, 0x84c00 ; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill @@ -10555,12 +10509,12 @@ ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], 
s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: s_mov_b32 s2, 0x84400 ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x83800 +; GFX6-NEXT: s_mov_b32 s2, 0x83c00 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10579,14 +10533,28 @@ ; GFX6-NEXT: ;;#ASMEND ; GFX6-NEXT: .LBB1_2: ; %ret ; GFX6-NEXT: s_or_b64 exec, exec, s[34:35] -; GFX6-NEXT: s_mov_b32 s4, 0x83400 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: s_mov_b64 exec, 15 +; GFX6-NEXT: s_mov_b32 s8, 0x80400 +; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s8 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_readlane_b32 s4, v4, 0 +; GFX6-NEXT: v_readlane_b32 s5, v4, 1 +; GFX6-NEXT: v_readlane_b32 s6, v4, 2 +; GFX6-NEXT: v_readlane_b32 s7, v4, 3 +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_mov_b64 exec, s[2:3] +; GFX6-NEXT: s_mov_b32 s4, 0x83800 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX6-NEXT: s_mov_b32 s4, 0x83000 +; 
GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: s_mov_b32 s4, 0x83400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10594,7 +10562,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82c00 +; GFX6-NEXT: s_mov_b32 s4, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10602,7 +10570,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82800 +; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10610,7 +10578,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x82400 +; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10618,7 +10586,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: 
s_mov_b32 s4, 0x82000 +; GFX6-NEXT: s_mov_b32 s4, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10626,7 +10594,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81c00 +; GFX6-NEXT: s_mov_b32 s4, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10634,7 +10602,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81800 +; GFX6-NEXT: s_mov_b32 s4, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10642,7 +10610,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x81400 +; GFX6-NEXT: s_mov_b32 s4, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10650,7 +10618,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 
0x81000 +; GFX6-NEXT: s_mov_b32 s4, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10658,7 +10626,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 +; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10666,7 +10634,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80400 +; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -10674,7 +10642,7 @@ ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s4, 0x80800 +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 @@ -10776,16 +10744,13 @@ ; GFX9-FLATSCR-NEXT: ; def s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; def s[44:45] -; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s33 ; GFX9-FLATSCR-NEXT: 
;;#ASMEND ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART -; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39],s[44:45] +; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill @@ -10930,16 +10895,13 @@ ; GFX10-FLATSCR-NEXT: ; def s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; def s[38:39] -; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; def s44 +; GFX10-FLATSCR-NEXT: ; def s38 ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART -; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35],s[38:39] +; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 @@ -11116,15 +11078,14 @@ %sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () %sgpr4 = call <4 x i32> asm sideeffect "; def $0", "=s" () %sgpr5 = call <2 x i32> asm sideeffect "; def $0", "=s" () - %sgpr6 = call <2 x i32> asm sideeffect "; def $0", "=s" () - %sgpr7 = call i32 asm sideeffect "; def $0", "=s" () + %sgpr6 = call i32 asm sideeffect "; def $0", "=s" () %cmp = icmp eq i32 %x, 0 br i1 %cmp, label %bb0, label %ret bb0: ; create SGPR pressure - call void asm sideeffect "; use $0,$1,$2,$3,$4,$5,$6", "s,s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, <2 x i32> %sgpr6, i32 %sgpr7) + call void asm sideeffect "; use 
$0,$1,$2,$3,$4,$5", "s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, i32 %sgpr6) ; mark most VGPR registers as used to increase register pressure call void asm sideeffect "", "~{v4},~{v8},~{v12},~{v16},~{v20},~{v24},~{v28},~{v32}" () diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; Spill an SGPR to scratch without having spare SGPRs available to save exec +; The test was originally written to spill an SGPR to scratch without having spare SGPRs +; available to save exec. This scenario won't be true anymore as we reserve SGPR(s) +; upfront for saving exec. 
define amdgpu_kernel void @test() #1 { ; GFX10-LABEL: test: @@ -18,44 +20,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s[8:12] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX10-NEXT: v_writelane_b32 v0, s8, 0 -; GFX10-NEXT: v_writelane_b32 v0, s9, 1 -; GFX10-NEXT: v_writelane_b32 v0, s10, 2 -; GFX10-NEXT: v_writelane_b32 v0, s11, 3 -; GFX10-NEXT: v_writelane_b32 v0, s12, 4 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_not_b64 exec, exec ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s[0:7] ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: s_mov_b64 s[6:7], exec -; GFX10-NEXT: s_mov_b64 exec, 31 -; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_readlane_b32 s0, v0, 0 -; GFX10-NEXT: v_readlane_b32 s1, v0, 1 -; GFX10-NEXT: v_readlane_b32 s2, v0, 2 -; GFX10-NEXT: v_readlane_b32 s3, v0, 3 -; GFX10-NEXT: v_readlane_b32 s4, v0, 4 -; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b64 exec, s[6:7] ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use s[0:4] +; GFX10-NEXT: ; use s[8:12] ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "={s[0:7]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll --- 
a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -10,16 +10,16 @@ ; GCN-LABEL: sgpr_spill_writelane: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_writelane_b32 v0, s35, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_readlane_b32 s35, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35}"() diff --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll @@ -39,6 +39,7 @@ ; CHECK-NEXT: occupancy: 5 ; CHECK-NEXT: scavengeFI: '%fixed-stack.0' ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 { @@ -300,8 +301,10 @@ ; CHECK-NEXT: fp64-fp16-output-denormals: true ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 5 +; CHECK-NEXT: scavengeFI: '%fixed-stack.0' ; CHECK-NEXT: vgprForAGPRCopy: '' -; CHECK-NEXT: longBranchReservedReg: '$sgpr100_sgpr101' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' +; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 
{ entry: @@ -406,8 +409,6 @@ %sgpr95 = tail call i32 asm sideeffect "s_mov_b32 s95, 0", "={s95}"() #1 %sgpr96 = tail call i32 asm sideeffect "s_mov_b32 s96, 0", "={s96}"() #1 %sgpr97 = tail call i32 asm sideeffect "s_mov_b32 s97, 0", "={s97}"() #1 - %sgpr98 = tail call i32 asm sideeffect "s_mov_b32 s98, 0", "={s98}"() #1 - %sgpr99 = tail call i32 asm sideeffect "s_mov_b32 s99, 0", "={s99}"() #1 %vcc_lo = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_lo}"() #1 %vcc_hi = tail call i32 asm sideeffect "s_mov_b32 $0, 0", "={vcc_hi}"() #1 %cmp = icmp ne i32 %cnd.load, 0 @@ -516,8 +517,6 @@ tail call void asm sideeffect "; reg use $0", "{s95}"(i32 %sgpr95) #1 tail call void asm sideeffect "; reg use $0", "{s96}"(i32 %sgpr96) #1 tail call void asm sideeffect "; reg use $0", "{s97}"(i32 %sgpr97) #1 - tail call void asm sideeffect "; reg use $0", "{s98}"(i32 %sgpr98) #1 - tail call void asm sideeffect "; reg use $0", "{s99}"(i32 %sgpr99) #1 tail call void asm sideeffect "; reg use $0", "{vcc_lo}"(i32 %vcc_lo) #1 tail call void asm sideeffect "; reg use $0", "{vcc_hi}"(i32 %vcc_hi) #1 ret void diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll @@ -39,6 +39,7 @@ ; AFTER-PEI-NEXT: occupancy: 5 ; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' +; AFTER-PEI-NEXT: sgprForEXECCopy: '' ; AFTER-PEI-NEXT: longBranchReservedReg: '' ; AFTER-PEI-NEXT: body: define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ 
b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -39,6 +39,7 @@ ; CHECK-NEXT: BitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: body: define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 { diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -39,6 +39,7 @@ ; CHECK-NEXT: BitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: body: define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) { diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -48,6 +48,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: @@ -149,6 +150,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: @@ -221,6 +223,7 @@ # FULL-NEXT: highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: @@ -294,6 +297,7 @@ # FULL-NEXT: 
highBitsOf32BitAddress: 0 # FULL-NEXT: occupancy: 8 # FULL-NEXT: vgprForAGPRCopy: '' +# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: body: @@ -541,3 +545,28 @@ SI_RETURN ... + +--- +# ALL-LABEL: name: sgpr_for_exec_copy +# ALL: sgprForEXECCopy: '$sgpr2_sgpr3' +name: sgpr_for_exec_copy +machineFunctionInfo: + sgprForEXECCopy: '$sgpr2_sgpr3' +body: | + bb.0: + SI_RETURN + +... + +--- +# ALL-LABEL: name: sgpr_for_exec_copy_noreg +# FULL: sgprForEXECCopy: '' +# SIMPLE-NOT: sgprForEXECCopy +name: sgpr_for_exec_copy_noreg +machineFunctionInfo: + sgprForEXECCopy: '$noreg' +body: | + bb.0: + SI_RETURN + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -42,6 +42,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { @@ -85,6 +86,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 10 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { @@ -152,6 +154,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define void @function() { @@ -201,6 +204,7 @@ ; CHECK-NEXT: highBitsOf32BitAddress: 0 ; CHECK-NEXT: occupancy: 8 ; CHECK-NEXT: vgprForAGPRCopy: '' +; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: body: define void 
@function_nsz() #0 { diff --git a/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir @@ -0,0 +1,12 @@ +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s + +--- +name: invalid_reg +machineFunctionInfo: +# ERR: [[@LINE+1]]:21: unknown register name 'srst' + sgprForEXECCopy: '$srst' +body: | + bb.0: + S_ENDPGM 0 + +...