Index: lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.h
+++ lib/Target/AMDGPU/SIFrameLowering.h
@@ -38,6 +38,10 @@
                                 RegScavenger *RS = nullptr) const override;
   void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
                                 RegScavenger *RS = nullptr) const;
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;

   bool isSupportedStackID(TargetStackID::Value ID) const override;

Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,6 +21,7 @@
 using namespace llvm;

+#define DEBUG_TYPE "frame-info"

 static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                          const MachineFunction &MF) {
@@ -34,6 +35,150 @@
                       ST.getMaxNumSGPRs(MF));
 }

+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+                                                 LivePhysRegs &LiveRegs,
+                                                 const TargetRegisterClass &RC,
+                                                 bool Unused = false) {
+  // Mark callee saved registers as used so we will not choose them.
+  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    LiveRegs.addReg(CSRegs[i]);
+
+  if (Unused) {
+    // We are looking for a register that can be used throughout the entire
+    // function, so any use is unacceptable.
+    for (unsigned Reg : RC) {
+      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  } else {
+    for (unsigned Reg : RC) {
+      if (LiveRegs.available(MRI, Reg))
+        return Reg;
+    }
+  }
+
+  // If we require an unused register, this is used in contexts where failure
+  // is an option and the caller has an alternative plan. In other contexts,
+  // this must succeed.
+  if (!Unused)
+    report_fatal_error("failed to find free scratch register");
+
+  return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+  LivePhysRegs LiveRegs;
+  LiveRegs.init(*MRI.getTargetRegisterInfo());
+  return findScratchNonCalleeSaveRegister(
+      MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to specially emit stack operations here because a different frame
+// register is used than in the rest of the function, as getFrameRegister would
+// use.
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const SIInstrInfo *TII, unsigned SpillReg, + unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + + int64_t Offset = MFI.getObjectOffset(FI); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, + MFI.getObjectAlignment(FI)); + + if (isUInt<12>(Offset)) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET)) + .addReg(SpillReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); +} + +static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const SIInstrInfo *TII, unsigned SpillReg, + unsigned ScratchRsrcReg, unsigned SPReg, int FI) { + MachineFunction *MF = MBB.getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + int64_t Offset = MFI.getObjectOffset(FI); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, + MFI.getObjectAlignment(FI)); + + if (isUInt<12>(Offset)) { + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addMemOperand(MMO); +} + void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { @@ -512,35 +657,6 @@ } } -// Find a scratch register that we can use at the start of the prologue to -// re-align the stack pointer. We avoid using callee-save registers since they -// may appear to be free when this is called from canUseAsPrologue (during -// shrink wrapping), but then no longer be free when this is called from -// emitPrologue. -// -// FIXME: This is a bit conservative, since in the above case we could use one -// of the callee-save registers as a scratch temp to re-align the stack pointer, -// but we would then have to make sure that we were in fact saving at least one -// callee-save register in the prologue, which is additional complexity that -// doesn't seem worth the benefit. 
-static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF, - LivePhysRegs &LiveRegs, - const TargetRegisterClass &RC) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - - // Mark callee saved registers as used so we will not choose them. - const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); - for (unsigned i = 0; CSRegs[i]; ++i) - LiveRegs.addReg(CSRegs[i]); - - for (unsigned Reg : RC) { - if (LiveRegs.available(MRI, Reg)) - return Reg; - } - - return AMDGPU::NoRegister; -} - bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { switch (ID) { case TargetStackID::Default: @@ -560,6 +676,7 @@ } const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); @@ -574,20 +691,90 @@ bool HasFP = false; uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = NumBytes; + // To avoid clobbering VGPRs in lanes that weren't active on function entry, + // turn on all lanes before doing the spill to memory. + unsigned ScratchExecCopy = AMDGPU::NoRegister; + + // Emit the copy if we need an FP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg + : FuncInfo->getSGPRSpillVGPRs()) { + if (!Reg.FI.hasValue()) + continue; + + if (ScratchExecCopy == AMDGPU::NoRegister) { + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + if (FuncInfo->SGPRForFPSaveRestoreCopy) + LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); + } + + ScratchExecCopy + = findScratchNonCalleeSaveRegister(MRI, LiveRegs, + *TRI.getWaveMaskRegClass()); + assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy); + + const unsigned OrSaveExec = ST.isWave32() ? + AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), + ScratchExecCopy) + .addImm(-1); + } + + buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), + StackPtrReg, + Reg.FI.getValue()); + } + + if (ScratchExecCopy != AMDGPU::NoRegister) { + // FIXME: Split block and make terminator. + unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(ScratchExecCopy, RegState::Kill); + LiveRegs.addReg(ScratchExecCopy); + } + + + if (FuncInfo->FramePointerSaveIndex) { + const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + assert(!MFI.isDeadObjectIndex(FI) && + MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill + = FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + + // Save FP before setting it up. 
+ // FIXME: This should respect spillSGPRToVGPR; + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill[0].VGPR) + .addReg(FramePtrReg) + .addImm(Spill[0].Lane) + .addReg(Spill[0].VGPR, RegState::Undef); + } if (TRI.needsStackRealignment(MF)) { HasFP = true; const unsigned Alignment = MFI.getMaxAlignment(); RoundedSize += Alignment; + if (LiveRegs.empty()) { + LiveRegs.init(TRI); + LiveRegs.addLiveIns(MBB); + LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); + } - LiveRegs.init(TRI); - LiveRegs.addLiveIns(MBB); - - unsigned ScratchSPReg - = findScratchNonCalleeSaveRegister(MF, LiveRegs, - AMDGPU::SReg_32_XM0RegClass); - assert(ScratchSPReg != AMDGPU::NoRegister); + unsigned ScratchSPReg = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); + assert(ScratchSPReg != AMDGPU::NoRegister && + ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy); // s_add_u32 tmp_reg, s32, NumBytes // s_and_b32 s32, tmp_reg, 0b111...0000 @@ -617,44 +804,13 @@ .setMIFlag(MachineInstr::FrameSetup); } - // To avoid clobbering VGPRs in lanes that weren't active on function entry, - // turn on all lanes before doing the spill to memory. - unsigned ScratchExecCopy = AMDGPU::NoRegister; - - for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg - : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) - continue; + assert(!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister || + FuncInfo->FramePointerSaveIndex) && + "Needed to save FP but didn't save it anywhere"); - if (ScratchExecCopy == AMDGPU::NoRegister) { - if (LiveRegs.empty()) { - LiveRegs.init(TRI); - LiveRegs.addLiveIns(MBB); - } - - ScratchExecCopy - = findScratchNonCalleeSaveRegister(MF, LiveRegs, - *TRI.getWaveMaskRegClass()); - - const unsigned OrSaveExec = ST.isWave32() ? - AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), - ScratchExecCopy) - .addImm(-1); - } - - TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true, - Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, - &TII->getRegisterInfo()); - } - - if (ScratchExecCopy != AMDGPU::NoRegister) { - // FIXME: Split block and make terminator. - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy); - } + assert(HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister && + !FuncInfo->FramePointerSaveIndex) && + "Saved FP but didn't need it"); } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -665,9 +821,45 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + LivePhysRegs LiveRegs; DebugLoc DL; + const MachineFrameInfo &MFI = MF.getFrameInfo(); + uint32_t NumBytes = MFI.getStackSize(); + uint32_t RoundedSize = FuncInfo->isStackRealigned() ? 
+ NumBytes + MFI.getMaxAlignment() : NumBytes; + + if (RoundedSize != 0 && hasFP(MF)) { + const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) + .addReg(StackPtrReg) + .addImm(RoundedSize * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameDestroy); + } + + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg()) + .addReg(FuncInfo->SGPRForFPSaveRestoreCopy) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (FuncInfo->FramePointerSaveIndex) { + const int FI = FuncInfo->FramePointerSaveIndex.getValue(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(!MFI.isDeadObjectIndex(FI)); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + ArrayRef Spill + = FuncInfo->getSGPRToVGPRSpills(FI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + FuncInfo->getFrameOffsetReg()) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); + } + unsigned ScratchExecCopy = AMDGPU::NoRegister; for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { @@ -677,24 +869,26 @@ const SIRegisterInfo &TRI = TII->getRegisterInfo(); if (ScratchExecCopy == AMDGPU::NoRegister) { // See emitPrologue - LivePhysRegs LiveRegs(*ST.getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - LiveRegs.stepBackward(*MBBI); + if (LiveRegs.empty()) { + LiveRegs.init(*ST.getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + LiveRegs.stepBackward(*MBBI); + } - ScratchExecCopy - = findScratchNonCalleeSaveRegister(MF, LiveRegs, - *TRI.getWaveMaskRegClass()); + ScratchExecCopy = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, *TRI.getWaveMaskRegClass()); + LiveRegs.removeReg(ScratchExecCopy); - const unsigned OrSaveExec = ST.isWave32() ? - AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + const unsigned OrSaveExec = + ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy) .addImm(-1); } - TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR, - Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, - &TII->getRegisterInfo()); + buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + FuncInfo->getScratchRSrcReg(), + FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue()); } if (ScratchExecCopy != AMDGPU::NoRegister) { @@ -702,25 +896,12 @@ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) - .addReg(ScratchExecCopy); - } - - const MachineFrameInfo &MFI = MF.getFrameInfo(); - uint32_t NumBytes = MFI.getStackSize(); - uint32_t RoundedSize = FuncInfo->isStackRealigned() ? - NumBytes + MFI.getMaxAlignment() : NumBytes; - - if (RoundedSize != 0 && hasFP(MF)) { - const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); - BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameDestroy); + .addReg(ScratchExecCopy, RegState::Kill); } } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not -// memory. +// memory. They should have been removed by now. 
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { @@ -744,15 +925,11 @@ RegScavenger *RS) const { MachineFrameInfo &MFI = MF.getFrameInfo(); - if (!MFI.hasStackObjects()) - return; - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - FuncInfo->removeSGPRToVGPRFrameIndices(MFI); + assert(allSGPRSpillsAreDead(MFI, FuncInfo->FramePointerSaveIndex)); // FIXME: The other checks should be redundant with allStackObjectsAreDead, // but currently hasNonSpillStackObjects is set only from source @@ -762,12 +939,12 @@ if (FuncInfo->isEntryFunction()) { int ScavengeFI = MFI.CreateFixedObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); + TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false); RS->addScavengingFrameIndex(ScavengeFI); } else { int ScavengeFI = MFI.CreateStackObject( - TRI.getSpillSize(AMDGPU::SGPR_32RegClass), - TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass), + TRI->getSpillSize(AMDGPU::SGPR_32RegClass), + TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass), false); RS->addScavengingFrameIndex(ScavengeFI); } @@ -776,17 +953,76 @@ // Only report VGPRs to generic code. void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, - BitVector &SavedRegs, + BitVector &SavedVGPRs, RegScavenger *RS) const { - TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS); + + SIMachineFunctionInfo *MFI = MF.getInfo(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); - // VGPRs used for SGPR spilling need to be specially inserted in the prolog. - const SIMachineFunctionInfo *MFI = MF.getInfo(); + // Ignore the SGPRs the default implementation found. + SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + + // hasFP only knows about stack objects that already exist. We're now + // determining the stack slots that will be created, so we have to predict + // them. Stack objects force FP usage with calls. + // + // Note a new VGPR CSR may be introduced if one is used for the spill, but we + // don't want to report it here. + // + // FIXME: Is this really hasReservedCallFrame? + const bool WillHaveFP = + FrameInfo.hasCalls() && + (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + + // VGPRs used for SGPR spilling need to be specially inserted in the prolog, + // so don't allow the default insertion to handle them. for (auto SSpill : MFI->getSGPRSpillVGPRs()) - SavedRegs.reset(SSpill.VGPR); + SavedVGPRs.reset(SSpill.VGPR); + + const bool HasFP = WillHaveFP || hasFP(MF); + if (!HasFP) + return; + + if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { + int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, + TargetStackID::SGPRSpill); + + // If there is already a VGPR with free lanes, use it. We may already have + // to pay the penalty for spilling a CSR VGPR. 
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + + MFI->FramePointerSaveIndex = NewFI; + + LLVM_DEBUG( + auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI) + << ':' << Spill.Lane << '\n'); + return; + } + + MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo()); + + if (!MFI->SGPRForFPSaveRestoreCopy) { + // There's no free lane to spill, and no free register to save FP, so we're + // forced to spill another VGPR to use for the spill. + int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, + TargetStackID::SGPRSpill); + if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + MFI->FramePointerSaveIndex = NewFI; + + LLVM_DEBUG( + auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI) + << ':' << Spill.Lane << '\n';); + } else { + LLVM_DEBUG(dbgs() << "Saving FP with copy to " << + printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n'); + } } void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, @@ -803,6 +1039,27 @@ SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); } +bool SIFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + if (CSI.empty()) + return true; // Early exit if no callee saved registers are modified! + + const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + if (!FuncInfo->SGPRForFPSaveRestoreCopy) + return false; + + for (auto &CS : CSI) { + if (CS.getReg() == FuncInfo->getFrameOffsetReg()) { + if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) + CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); + break; + } + } + + return false; +} + MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, @@ -842,6 +1099,9 @@ if (MFI.hasCalls()) { // All offsets are unsigned, so need to be addressed in the same direction // as stack growth. + + // FIXME: This function is pretty broken, since it can be called before the + // frame layout is determined or CSR spills are inserted. if (MFI.getStackSize() != 0) return true; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2616,8 +2616,6 @@ MachineFrameInfo &MFI = MF.getFrameInfo(); SmallVector, 8> RegsToPass; - SDValue CallerSavedFP; - // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) { @@ -2630,15 +2628,6 @@ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); - - if (!Info->isEntryFunction()) { - // Avoid clobbering this function's FP value. In the current convention - // callee will overwrite this, so do save/restore around the call site. 
-      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
-                                         Info->getFrameOffsetReg(), MVT::i32);
-      CopyFromChains.push_back(CallerSavedFP.getValue(1));
-    }
-
     Chain = DAG.getTokenFactor(DL, CopyFromChains);
   }
@@ -2827,12 +2816,6 @@
   Chain = Call.getValue(0);
   InFlag = Call.getValue(1);

-  if (CallerSavedFP) {
-    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
-    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   uint64_t CalleePopBytes = NumBytes;
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -957,8 +957,8 @@
   // Add the scratch resource registers as implicit uses because we may end up
   // needing them, and need to ensure that the reserved registers are
   // correctly handled.
-
-  FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+  if (RI.spillSGPRToVGPR())
+    FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
   if (ST.hasScalarStores()) {
     // m0 is used for offset to scalar stores if used to spill.
     Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
@@ -1052,7 +1052,8 @@
     MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
   }

-  FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+  if (RI.spillSGPRToVGPR())
+    FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
   MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
     .addFrameIndex(FrameIndex) // addr
     .addMemOperand(MMO)
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -307,6 +307,12 @@
   unsigned NumVGPRSpillLanes = 0;
   SmallVector SpillVGPRs;

+public: // FIXME
+  /// If this is set, the SGPR used to save and restore the register used for
+  /// the frame pointer.
+  unsigned SGPRForFPSaveRestoreCopy = 0;
+  Optional<int> FramePointerSaveIndex;
+
 public:
   SIMachineFunctionInfo(const MachineFunction &MF);

@@ -326,6 +332,8 @@
     return Mode;
   }

+  bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+                                 unsigned NumNeed) const;
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
   void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -70,7 +70,9 @@
     // required for scratch access.
     ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
     ScratchWaveOffsetReg = AMDGPU::SGPR33;
-    FrameOffsetReg = AMDGPU::SGPR5;
+
+    // TODO: Pick a high register, and shift down, similar to a kernel.
+    FrameOffsetReg = AMDGPU::SGPR34;
     StackPtrOffsetReg = AMDGPU::SGPR32;

     ArgInfo.PrivateSegmentBuffer =
@@ -229,6 +231,17 @@
   return false;
 }

+/// \returns true if \p NumNeed slots are available in VGPRs already used for
+/// SGPR spilling.
+// +// FIXME: This only works after processFunctionBeforeFrameFinalized +bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF, + unsigned NumNeed) const { + const GCNSubtarget &ST = MF.getSubtarget(); + unsigned WaveSize = ST.getWavefrontSize(); + return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size(); +} + /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int FI) { @@ -291,13 +304,18 @@ } void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) { - for (auto &R : SGPRToVGPRSpills) - MFI.RemoveStackObject(R.first); - // All other SPGRs must be allocated on the default stack, so reset - // the stack ID. - for (unsigned i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); - i != e; ++i) - MFI.setStackID(i, 0); + // The FP spill hasn't been inserted yet, so keep it around. + for (auto &R : SGPRToVGPRSpills) { + if (R.first != FramePointerSaveIndex) + MFI.RemoveStackObject(R.first); + } + + // All other SPGRs must be allocated on the default stack, so reset the stack + // ID. + for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; + ++i) + if (i != FramePointerSaveIndex) + MFI.setStackID(i, TargetStackID::Default); } MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { Index: test/CodeGen/AMDGPU/atomicrmw-nand.ll =================================================================== --- test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -6,7 +6,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: ds_read_b32 v2, v0 -; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: BB0_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -18,11 +18,11 @@ ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB0_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst @@ -34,23 +34,23 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_dword v3, v[0:1], off -; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: BB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB1_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] 
%result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst @@ -62,24 +62,24 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_dword v3, v[0:1] -; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: BB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB2_1 ; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw nand i32* %ptr, i32 4 seq_cst Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/byval-frame-setup.ll +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -69,28 +69,30 @@ } ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: -; GCN: s_mov_b32 s5, s32 -; GCN: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: buffer_store_dword v32 -; GCN-DAG: buffer_store_dword v33 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36 +; GCN-DAG: v_writelane_b32 v33, s34, +; GCN: s_mov_b32 s34, s32 +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} +; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 -; GCN-DAG: v_writelane_b32 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} + ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] -; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}} +; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:16{{$}} ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]] ; GCN: s_swappc_b64 -; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}} +; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s34 offset:16{{$}} ; GCN: v_readlane_b32 ; GCN-NOT: v_readlane_b32 s32 -; GCN-DAG: buffer_load_dword v32, -; GCN-DAG: buffer_load_dword v33, +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} +; GCN: v_readlane_b32 s34, v33, +; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: @@ -108,20 +110,20 @@ } ; GCN-LABEL: {{^}}call_void_func_byval_struct_func: -; GCN: s_mov_b32 s5, s32 +; GCN: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} ; GCN-DAG: v_writelane_b32 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN-DAG: 
buffer_store_dword [[NINE]], off, s[0:3], s5{{$}} -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}} +; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12 +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12 ; GCN-NOT: s_add_u32 s32, s32, 0x800 @@ -131,10 +133,10 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 @@ -149,8 +151,9 @@ ; GCN-NOT: s_sub_u32 s32, s32, 0x800 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; GCN: v_readlane_b32 s34, v +; GCN: s_waitcnt +; GCN: s_setpc_b64 define void @call_void_func_byval_struct_func() #1 { entry: %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) @@ -301,20 +304,20 @@ } ; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func: -; GCN: s_mov_b32 s5, s32 +; GCN: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} ; GCN-DAG: v_writelane_b32 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}} -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}} +; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12 +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12 ; GCN-NOT: s_add_u32 s32, s32, 0x800 @@ -323,10 +326,10 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, 
s[0:3], s5 offset:20 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 ; GCN: s_waitcnt vmcnt(0) ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 @@ -341,7 +344,8 @@ ; GCN-NOT: s_sub_u32 s32, s32, 0x800 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN-NEXT: s_waitcnt +; GCN: v_readlane_b32 s34, v +; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @call_void_func_byval_struct_align8_func() #0 { entry: @@ -380,7 +384,7 @@ ret void } -declare void @external_void_func_void() #0 +declare hidden void @external_void_func_void() #0 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3 declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #3 Index: test/CodeGen/AMDGPU/call-graph-register-usage.ll =================================================================== --- test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,14 +13,14 @@ } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v32, s34, 0 -; GCN: v_writelane_b32 v32, s35, 1 -; GCN: v_writelane_b32 v32, s36, 2 +; GCN: v_writelane_b32 v32, s34, 2 +; GCN: v_writelane_b32 v32, s36, 0 +; GCN: v_writelane_b32 v32, s37, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s36, v32, 2 -; GCN: v_readlane_b32 s35, v32, 1 -; GCN: v_readlane_b32 s34, v32, 0 -; GCN: ; NumSgprs: 39 +; GCN: v_readlane_b32 s37, v32, 1 +; GCN: v_readlane_b32 s36, v32, 0 +; GCN: v_readlane_b32 s34, v32, 2 +; GCN: ; NumSgprs: 40 ; GCN: ; NumVgprs: 33 define void @indirect_use_vcc() #1 { call void @use_vcc() @@ -29,8 +29,8 @@ ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: ; GCN: is_dynamic_callstack = 0 -; CI: ; NumSgprs: 41 -; VI-NOBUG: ; NumSgprs: 43 +; CI: ; NumSgprs: 42 +; VI-NOBUG: ; NumSgprs: 44 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 33 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { @@ -48,8 +48,8 @@ } ; GCN-LABEL: {{^}}indirect_use_flat_scratch: -; CI: ; NumSgprs: 41 -; VI: ; NumSgprs: 43 +; CI: ; NumSgprs: 42 +; VI: ; NumSgprs: 44 ; GCN: ; NumVgprs: 33 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() @@ -58,8 +58,8 @@ ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: ; GCN: is_dynamic_callstack = 0 -; CI: ; NumSgprs: 41 -; VI-NOBUG: ; NumSgprs: 43 +; CI: ; NumSgprs: 42 +; VI-NOBUG: ; NumSgprs: 44 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 33 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- test/CodeGen/AMDGPU/call-preserved-registers.ll +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -23,25 +23,23 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: v_writelane_b32 v32, s34, 0 -; GCN: v_writelane_b32 v32, s35, 1 -; GCN: v_writelane_b32 v32, s36, 2 -; GCN: v_writelane_b32 v32, s37, 3 -; GCN: v_writelane_b32 v32, s38, 4 +; GCN: buffer_store_dword +; GCN: v_writelane_b32 v32, s34, 4 +; GCN: 
v_writelane_b32 v32, s36, 0 +; GCN: v_writelane_b32 v32, s37, 1 +; GCN: v_writelane_b32 v32, s38, 2 -; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 -; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]] -; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5 +; GCN: s_swappc_b64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; GCN-DAG: s_mov_b32 s5, [[COPY_FP]] -; GCN-DAG: v_readlane_b32 s38, v32, 4 -; GCN: v_readlane_b32 s37, v32, 3 -; GCN: v_readlane_b32 s36, v32, 2 -; GCN: v_readlane_b32 s35, v32, 1 -; GCN: v_readlane_b32 s34, v32, 0 +; GCN-DAG: v_readlane_b32 s39, v32, 3 +; GCN-DAG: v_readlane_b32 s38, v32, 2 +; GCN: v_readlane_b32 s37, v32, 1 +; GCN: v_readlane_b32 s36, v32, 0 + +; GCN: v_readlane_b32 s34, v32, 4 +; GCN: buffer_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() @@ -50,14 +48,17 @@ ret void } -; FIXME: Avoid extra restore of FP in between calls. ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 -; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]] -; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5 +; GCN: buffer_store_dword v32 +; GCN: v_writelane_b32 v32, s34, 4 + +; GCN: s_mov_b32 s34, s32 +; GCN: s_add_u32 s32, s32, 0x400 +; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b32 s5, [[COPY_FP]] + +; GCN: v_readlane_b32 s34, v32, 4 +; GCN: buffer_load_dword v32, define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() Index: test/CodeGen/AMDGPU/call-waitcnt.ll =================================================================== --- test/CodeGen/AMDGPU/call-waitcnt.ll +++ test/CodeGen/AMDGPU/call-waitcnt.ll @@ -119,13 +119,13 @@ ; GCN-LABEL: tailcall_got_load: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+4 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[6:7] +; GCN-NEXT: s_setpc_b64 s[4:5] tail call void @got.func(i32 0) ret void } @@ -135,11 +135,11 @@ ; GCN-LABEL: tail_call_memory_arg_load: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 ; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[6:7] +; GCN-NEXT: s_setpc_b64 s[4:5] %vgpr = load volatile i32, i32 addrspace(3)* %ptr tail call void @func(i32 %vgpr) ret void Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -12,7 +12,9 @@ ; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: s_mov_b32 s4, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s34, s4 ; GCN-NEXT: s_setpc_b64 define void 
@callee_no_stack_no_fp_elim_all() #1 { ret void @@ -39,14 +41,18 @@ ret void } +; Can use free call clobbered register to preserve original FP value. + ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: s_mov_b32 s4, s34 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x200 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s34 offset:4{{$}} ; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; GCN-NEXT: s_mov_b32 s34, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { @@ -71,25 +77,30 @@ ; GCN-LABEL: {{^}}callee_with_stack_and_call: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN: s_mov_b32 s5, s32 +; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN: v_writelane_b32 [[CSR_VGPR]], s34, 2 +; GCN-DAG: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 - -; GCN-DAG: v_writelane_b32 v32, s34, -; GCN-DAG: v_writelane_b32 v32, s35, -; GCN-DAG: v_writelane_b32 v32, s36, -; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}} -; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 +; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36, +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37, +; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34{{$}} ; GCN: s_swappc_b64 -; GCN-DAG: s_mov_b32 s5, [[COPY_FP]] -; GCN-DAG: v_readlane_b32 s34, -; GCN-DAG: v_readlane_b32 s35, -; GCN-DAG: v_readlane_b32 s36, -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 -; GCN: s_waitcnt + +; GCN-DAG: v_readlane_b32 s36, [[CSR_VGPR]] +; GCN-DAG: v_readlane_b32 s37, [[CSR_VGPR]] + +; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2 +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_and_call() #0 { %alloca = alloca i32, addrspace(5) @@ -106,48 +117,50 @@ ; GCN-LABEL: {{^}}callee_no_stack_with_call: ; GCN: s_waitcnt -; GCN: s_mov_b32 s5, s32 -; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s34, 0 -; GCN-DAG: v_writelane_b32 v32, s35, 1 -; GCN-DAG: v_writelane_b32 v32, s36, 2 -; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5 -; GCN: s_swappc_b64 -; GCN: s_mov_b32 s5, [[COPY_FP]] +; GCN-DAG: s_add_u32 s32, s32, 0x400 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s34, [[FP_SPILL_LANE:[0-9]+]] -; GCN-DAG: v_readlane_b32 s34, v32, 0 -; GCN-DAG: v_readlane_b32 s35, v32, 1 -; GCN-DAG: v_readlane_b32 s36, v32, 2 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36, 0 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37, 1 +; GCN: s_swappc_b64 -; GCN: s_or_saveexec_b64 
[[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; GCN-DAG: v_readlane_b32 s36, v32, 0 +; GCN-DAG: v_readlane_b32 s37, v32, 1 ; GCN: s_sub_u32 s32, s32, 0x400 -; GCN: s_setpc_b64 +; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]] +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_with_call() #0 { call void @external_void_func_void() ret void } -declare void @external_void_func_void() #0 +declare hidden void @external_void_func_void() #0 -; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored +; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and +; restored. No FP is required. +; ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN: v_writelane_b32 [[CSR_VGPR]], s +; GCN: v_writelane_b32 [[CSR_VGPR]], s -; GCN: v_writelane_b32 v32 ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s{{[0-9]+}}, v32 +; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] +; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] - ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { @@ -188,6 +201,256 @@ ret void } +; TODO: Can the SP inc/deec be remvoed? +; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr: +; GCN: s_waitcnt +; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:8 + +; GCN: ;;#ASMSTART +; GCN-NEXT: ; clobber v33 +; GCN-NEXT: ;;#ASMEND + +; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: s_add_u32 s32, s32, 0x300 +; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; GCN-NEXT: s_mov_b32 s34, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + call void asm sideeffect "; clobber v33", "~{v33}"() + ret void +} + +; Use a copy to a free SGPR instead of introducing a second CSR VGPR. 
+; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: +; GCN: s_waitcnt +; GCN-NEXT: v_writelane_b32 v1, s34, 63 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-COUNT-63: v_writelane_b32 v1 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:8 +; GCN: ;;#ASMSTART +; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1 + +; GCN: s_add_u32 s32, s32, 0x300 +; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; GCN-NEXT: v_readlane_b32 s34, v1, 63 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @last_lane_vgpr_for_fp_csr() #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "", + "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} + ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} + ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69} + ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79} + ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89} + ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99} + ,~{s100},~{s101},~{s102}"() #1 + + ret void +} + +; Use a copy to a free SGPR instead of introducing a second CSR VGPR. +; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: +; GCN: s_waitcnt +; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-COUNT-64: v_writelane_b32 v1, + +; GCN: buffer_store_dword +; GCN: ;;#ASMSTART +; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 + +; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: s_add_u32 s32, s32, 0x300 +; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @no_new_vgpr_for_fp_csr() #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "", + "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} + ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} + ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69} + ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79} + ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89} + ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99} + ,~{s100},~{s101},~{s102}"() #1 + + ret void +} + +; GCN-LABEL: {{^}}realign_stack_no_fp_elim: +; GCN: s_waitcnt +; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 +; GCN-NEXT: s_mov_b32 s4, s34 +; GCN-NEXT: s_and_b32 s34, [[SCRATCH]], 0xfff80000 +; GCN-NEXT: s_add_u32 s32, s32, 0x100000 +; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s34 +; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 +; GCN-NEXT: s_mov_b32 s34, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @realign_stack_no_fp_elim() #1 { + %alloca = alloca i32, align 8192, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + ret void +} + +; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: +; GCN: s_waitcnt +; GCN-NEXT: v_writelane_b32 v1, s34, 0 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:4 +; GCN: ;;#ASMSTART +; GCN: 
s_add_u32 s32, s32, 0x200 +; GCN-NEXT: s_mov_b64 s[30:31], vcc +; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; GCN-NEXT: v_readlane_b32 s34, v1, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +define void @no_unused_non_csr_sgpr_for_fp() #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + ; Use all clobberable registers, so FP has to spill to a VGPR. + call void asm sideeffect "", + "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} + ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} + ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} + ,~{s30},~{s31}"() #0 + + ret void +} + +; Need a new CSR VGPR to satisfy the FP spill. +; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: +; GCN: s_waitcnt +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN-NEXT: v_writelane_b32 v32, s34, 0 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN: s_add_u32 s32, s32, 0x300{{$}} + +; GCN-DAG: s_mov_b64 vcc, s[30:31] +; GCN-DAG: buffer_store_dword + +; GCN: ;;#ASMSTART +; GCN: s_mov_b64 s[30:31], vcc + +; GCN: s_sub_u32 s32, s32, 0x300{{$}} +; GCN-NEXT: v_readlane_b32 s34, v32, 0 +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + ; Use all clobberable registers, so FP has to spill to a VGPR. + call void asm sideeffect "", + "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} + ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} + ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} + ,~{s30},~{s31}"() #0 + + call void asm sideeffect "; clobber nonpreserved VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31}"() #1 + + ret void +} + +; The byval argument exceeds the MUBUF constant offset, so a scratch +; register is needed to access the CSR VGPR slot. 
+; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: +; GCN: s_waitcnt +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] +; GCN-NEXT: v_writelane_b32 v32, s34, 0 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} +; GCN-DAG: s_mov_b64 vcc, s[30:31] +; GCN-DAG: buffer_store_dword + +; GCN: ;;#ASMSTART +; GCN: s_mov_b64 s[30:31], vcc + +; GCN: s_sub_u32 s32, s32, 0x40300{{$}} +; GCN-NEXT: v_readlane_b32 s34, v32, 0 +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #1 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + + ; Use all clobberable registers, so FP has to spill to a VGPR. + call void asm sideeffect "; clobber nonpreserved SGPRs", + "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} + ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} + ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} + ,~{s30},~{s31}"() #0 + + ; Use all clobberable VGPRs, so a CSR spill is needed for the VGPR + call void asm sideeffect "; clobber nonpreserved VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31}"() #1 + + ret void +} + +; GCN-LABEL: {{^}}local_empty_func: +; GCN: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define internal void @local_empty_func() #0 { + ret void +} + +; An FP is needed, despite not needing any spills +; TODO: Ccould see callee does not use stack and omit FP. 
+; GCN-LABEL: {{^}}ipra_call_with_stack: +; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 +; GCN: s_mov_b32 s34, s32 +; GCN: s_add_u32 s32, s32, 0x400 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}} +; GCN: s_swappc_b64 +; GCN: s_sub_u32 s32, s32, 0x400 +; GCN: s_mov_b32 s34, [[FP_COPY:s[0-9]+]] +define void @ipra_call_with_stack() #0 { + %alloca = alloca i32, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + call void @local_empty_func() + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind "frame-pointer"="all" } attributes #2 = { nounwind "frame-pointer"="non-leaf" } Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}use_dispatch_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define hidden void @use_dispatch_ptr() #1 { %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 @@ -14,15 +14,17 @@ ; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr: ; GCN: enable_sgpr_dispatch_ptr = 1 -; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 { call void @use_dispatch_ptr() ret void } ; GCN-LABEL: {{^}}use_queue_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define hidden void @use_queue_ptr() #1 { %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 @@ -33,8 +35,9 @@ ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr: ; GCN: enable_sgpr_queue_ptr = 1 -; GCN: s_mov_b64 s[6:7], s[4:5] -; GCN: s_swappc_b64 +; GCN-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { call void @use_queue_ptr() ret void @@ -55,18 +58,17 @@ ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast: ; CIVI: enable_sgpr_queue_ptr = 1 - -; CIVI: s_mov_b64 s[6:7], s[4:5] -; GFX9-NOT: s_mov_b64 -; GCN: s_swappc_b64 +; CIVI-NOT: s[4:5] +; CIVI-NOT: s4 +; CIVI-NOT: s5 define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 { call void @use_queue_ptr_addrspacecast() ret void } ; GCN-LABEL: {{^}}use_kernarg_segment_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define hidden void @use_kernarg_segment_ptr() #1 { %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 @@ -77,7 +79,9 @@ ; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr: ; GCN: enable_sgpr_kernarg_segment_ptr = 1 -; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 { call void 
@use_kernarg_segment_ptr() @@ -85,7 +89,7 @@ } ; GCN-LABEL: {{^}}use_dispatch_id: -; GCN: ; use s[6:7] +; GCN: ; use s[4:5] define hidden void @use_dispatch_id() #1 { %id = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %id) @@ -97,8 +101,9 @@ ; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id: ; GCN: enable_sgpr_dispatch_id = 1 - -; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 { call void @use_dispatch_id() ret void @@ -147,7 +152,7 @@ ; GCN-LABEL: {{^}}use_workgroup_id_xy: ; GCN: ; use s4 -; GCN: ; use s6 +; GCN: ; use s5 define hidden void @use_workgroup_id_xy() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -158,8 +163,8 @@ ; GCN-LABEL: {{^}}use_workgroup_id_xyz: ; GCN: ; use s4 +; GCN: ; use s5 ; GCN: ; use s6 -; GCN: ; use s7 define hidden void @use_workgroup_id_xyz() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -172,7 +177,7 @@ ; GCN-LABEL: {{^}}use_workgroup_id_xz: ; GCN: ; use s4 -; GCN: ; use s6 +; GCN: ; use s5 define hidden void @use_workgroup_id_xz() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -183,7 +188,7 @@ ; GCN-LABEL: {{^}}use_workgroup_id_yz: ; GCN: ; use s4 -; GCN: ; use s6 +; GCN: ; use s5 define hidden void @use_workgroup_id_yz() #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -244,8 +249,8 @@ ; GCN: s_mov_b32 s33, s8 +; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s6, s7 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { @@ -261,8 +266,8 @@ ; GCN: s_mov_b32 s33, s9 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s6, s7 -; GCN: s_mov_b32 s7, s8 +; GCN: s_mov_b32 s5, s7 +; GCN: s_mov_b32 s6, s8 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 @@ -277,8 +282,8 @@ ; GCN: enable_sgpr_workgroup_id_z = 1 ; GCN: s_mov_b32 s33, s8 +; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s6, s7 ; GCN: s_mov_b32 s32, s33 @@ -294,8 +299,8 @@ ; GCN: enable_sgpr_workgroup_id_z = 1 ; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s6, s8 ; GCN: s_mov_b32 s4, s7 +; GCN: s_mov_b32 s5, s8 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { @@ -404,19 +409,19 @@ ; GCN-LABEL: {{^}}use_every_sgpr_input: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} +; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 +; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s10 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -; GCN: ; use s[12:13] -; GCN: ; use s4 +; GCN: ; use s[10:11] +; GCN: ; use s12 +; GCN: ; use s13 ; GCN: ; use s14 -; GCN: ; use s15 define hidden void @use_every_sgpr_input() #1 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -462,13 +467,9 @@ ; GCN: enable_sgpr_flat_scratch_init = 1 ; 
GCN: s_mov_b32 s33, s17 -; GCN: s_mov_b64 s[12:13], s[10:11] -; GCN: s_mov_b64 s[10:11], s[8:9] -; GCN: s_mov_b64 s[8:9], s[6:7] -; GCN: s_mov_b64 s[6:7], s[4:5] -; GCN: s_mov_b32 s4, s14 -; GCN: s_mov_b32 s14, s15 -; GCN: s_mov_b32 s15, s16 +; GCN: s_mov_b32 s12, s14 +; GCN: s_mov_b32 s13, s15 +; GCN: s_mov_b32 s14, s16 ; GCN: s_mov_b32 s32, s33 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { @@ -489,17 +490,20 @@ ; GCN-NOT: s[8:9] ; GCN-NOT: s[10:11] ; GCN-NOT: s[12:13] -; GCN: s_or_saveexec_b64 s[6:7], -1 +; GCN: s_or_saveexec_b64 s[4:5], -1 define hidden void @func_indirect_use_every_sgpr_input() #1 { call void @use_every_sgpr_input() ret void } ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz: -; GCN-NOT: s_mov_b32 s4 +; GCN: s_mov_b32 s4, s12 +; GCN: s_mov_b32 s5, s13 ; GCN: s_mov_b32 s6, s14 -; GCN-NEXT: s_mov_b32 s7, s15 -; GCN-NOT: s_mov_b32 s4 +; GCN: ; use s[10:11] +; GCN: ; use s12 +; GCN: ; use s13 +; GCN: ; use s14 ; GCN: s_swappc_b64 define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { @@ -535,26 +539,27 @@ } ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill: -; GCN: s_mov_b32 s5, s32 - +; GCN-DAG: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 +; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[4:5] +; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[6:7] + + +; GCN: s_mov_b32 s4, s12 +; GCN: s_mov_b32 s5, s13 +; GCN: s_mov_b32 s6, s14 -; GCN: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7] -; GCN-NOT: s_mov_b32 s4, -; GCN-DAG: s_mov_b32 s6, s14 -; GCN-DAG: s_mov_b32 s7, s15 +; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9] -; GCN: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9] +; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12 +; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13 +; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s14 -; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s4 -; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s14 -; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s15 -; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11] ; GCN: s_swappc_b64 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]] ; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]] ; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}} Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -305,7 +305,7 @@ } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: -; GCN: s_mov_b32 s5, s32 +; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { @@ -324,15 +324,15 @@ ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: -; GCN: s_add_u32 s32, s32, 0x400{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5{{$}} +; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s34{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( @@ -445,10 +445,10 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}} ; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5{{$}} +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -27,29 +27,27 @@ ; GCN-LABEL: call_split_type_used_outside_block_v2f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s36, 0 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s34, 0 -; GCN-NEXT: v_writelane_b32 v32, s35, 1 -; GCN-NEXT: v_writelane_b32 v32, s36, 2 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func_v2f32@rel32@hi+4 -; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s36, s5 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_mov_b32 s5, s36 -; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s36, v32, 2 -; GCN-NEXT: v_readlane_b32 s35, v32, 1 -; GCN-NEXT: v_readlane_b32 s34, v32, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_writelane_b32 v32, s37, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4 +; GCN-NEXT: s_mov_b64 s[36:37], s[30:31] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_mov_b64 s[30:31], s[36:37] +; GCN-NEXT: v_readlane_b32 s37, v32, 1 +; GCN-NEXT: v_readlane_b32 s36, v32, 0 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; 
GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: @@ -65,29 +63,27 @@ ; GCN-LABEL: call_split_type_used_outside_block_v3f32: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s36, 0 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s34, 0 -; GCN-NEXT: v_writelane_b32 v32, s35, 1 -; GCN-NEXT: v_writelane_b32 v32, s36, 2 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func_v3f32@rel32@hi+4 -; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s36, s5 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_mov_b32 s5, s36 -; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s36, v32, 2 -; GCN-NEXT: v_readlane_b32 s35, v32, 1 -; GCN-NEXT: v_readlane_b32 s34, v32, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_writelane_b32 v32, s37, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4 +; GCN-NEXT: s_mov_b64 s[36:37], s[30:31] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_mov_b64 s[30:31], s[36:37] +; GCN-NEXT: v_readlane_b32 s37, v32, 1 +; GCN-NEXT: v_readlane_b32 s36, v32, 0 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: @@ -103,29 +99,27 @@ ; GCN-LABEL: call_split_type_used_outside_block_v4f16: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s36, 0 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s34, 0 -; GCN-NEXT: v_writelane_b32 v32, s35, 1 -; GCN-NEXT: v_writelane_b32 v32, s36, 2 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func_v4f16@rel32@hi+4 -; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s36, s5 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_mov_b32 s5, s36 -; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s36, v32, 2 -; GCN-NEXT: v_readlane_b32 s35, v32, 1 -; GCN-NEXT: v_readlane_b32 s34, v32, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_writelane_b32 v32, 
s37, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4 +; GCN-NEXT: s_mov_b64 s[36:37], s[30:31] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_mov_b64 s[30:31], s[36:37] +; GCN-NEXT: v_readlane_b32 s37, v32, 1 +; GCN-NEXT: v_readlane_b32 s36, v32, 0 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: @@ -141,30 +135,28 @@ ; GCN-LABEL: call_split_type_used_outside_block_struct: ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s32 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s36, 0 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_writelane_b32 v32, s34, 0 -; GCN-NEXT: v_writelane_b32 v32, s35, 1 -; GCN-NEXT: v_writelane_b32 v32, s36, 2 -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s7, s7, func_struct@rel32@hi+4 -; GCN-NEXT: s_mov_b64 s[34:35], s[30:31] -; GCN-NEXT: s_mov_b32 s36, s5 -; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_mov_b32 s5, s36 +; GCN-NEXT: v_writelane_b32 v32, s37, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4 +; GCN-NEXT: s_mov_b64 s[36:37], s[30:31] +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_mov_b64 s[30:31], s[36:37] +; GCN-NEXT: v_readlane_b32 s37, v32, 1 +; GCN-NEXT: v_readlane_b32 s36, v32, 0 ; GCN-NEXT: v_mov_b32_e32 v1, v4 -; GCN-NEXT: s_mov_b64 s[30:31], s[34:35] -; GCN-NEXT: v_readlane_b32 s36, v32, 2 -; GCN-NEXT: v_readlane_b32 s35, v32, 1 -; GCN-NEXT: v_readlane_b32 s34, v32, 0 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb0: Index: test/CodeGen/AMDGPU/frame-index-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/frame-index-elimination.ll +++ test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -29,7 +29,7 @@ ; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 -; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]] +; CI-NEXT: v_add_i32_e64 v1, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]] ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 ; CI-NEXT: ds_write_b32 v0, v1 @@ -181,7 +181,7 @@ ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x200 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 -; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], [[K]], [[SCALED]] +; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[SCALED]] ; GFX9-DAG: 
v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]] @@ -233,10 +233,10 @@ ; GCN-LABEL: {{^}}undefined_stack_store_reg: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset: +; GCN: buffer_store_dword v0, off, s[0:3], s34 offset: +; GCN: buffer_store_dword v0, off, s[0:3], s34 offset: +; GCN: buffer_store_dword v0, off, s[0:3], s34 offset: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { bb: %tmp = alloca <4 x float>, align 16, addrspace(5) Index: test/CodeGen/AMDGPU/function-returns.ll =================================================================== --- test/CodeGen/AMDGPU/function-returns.ll +++ test/CodeGen/AMDGPU/function-returns.ll @@ -104,7 +104,7 @@ } ; GCN-LABEL: {{^}}f32_func_void: -; GCN: buffer_load_dword v0, off, s[8:11], 0 +; GCN: buffer_load_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define float @f32_func_void() #0 { Index: test/CodeGen/AMDGPU/ipra.ll =================================================================== --- test/CodeGen/AMDGPU/ipra.ll +++ test/CodeGen/AMDGPU/ipra.ll @@ -63,10 +63,10 @@ ; GCN-LABEL: {{^}}func_tail_call: ; GCN: s_waitcnt -; GCN-NEXT: s_getpc_b64 s[6:7] -; GCN-NEXT: s_add_u32 s6, -; GCN-NEXT: s_addc_u32 s7, -; GCN-NEXT: s_setpc_b64 s[6:7] +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, +; GCN-NEXT: s_addc_u32 s5, +; GCN-NEXT: s_setpc_b64 s[4:5] ; GCN: ; NumSgprs: 32 ; GCN: ; NumVgprs: 8 Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -509,8 +509,8 @@ ; SI-LABEL: {{^}}test_fold_and_ord: ; SI: s_waitcnt -; SI-NEXT: v_cmp_class_f32_e64 s[6:7], v0, 32{{$}} -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}} +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]] ; SI-NEXT: s_setpc_b64 define i1 @test_fold_and_ord(float %a) { %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 @@ -521,8 +521,8 @@ ; SI-LABEL: {{^}}test_fold_and_unord: ; SI: s_waitcnt -; SI-NEXT: v_cmp_class_f32_e64 s[6:7], v0, 3{{$}} -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}} +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]] ; SI-NEXT: s_setpc_b64 define i1 @test_fold_and_unord(float %a) { %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -59,11 +59,11 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt -; MESA: v_mov_b32_e32 v0, s6 -; MESA: v_mov_b32_e32 v1, s7 +; MESA: v_mov_b32_e32 v0, s4 +; MESA: v_mov_b32_e32 v1, s5 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: v_mov_b32_e32 v0, s6 -; HSA: v_mov_b32_e32 v1, s7 +; HSA: v_mov_b32_e32 v0, s4 +; HSA: v_mov_b32_e32 v1, s5 ; HSA: flat_load_dword v0, v[0:1] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -76,11 +76,11 @@ ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr: ; GCN: s_waitcnt -; 
MESA: v_mov_b32_e32 v0, s6 -; MESA: v_mov_b32_e32 v1, s7 +; MESA: v_mov_b32_e32 v0, s4 +; MESA: v_mov_b32_e32 v1, s5 ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: v_mov_b32_e32 v0, s6 -; HSA: v_mov_b32_e32 v1, s7 +; HSA: v_mov_b32_e32 v0, s4 +; HSA: v_mov_b32_e32 v1, s5 ; HSA: flat_load_dword v0, v[0:1] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -95,7 +95,9 @@ ; GCN: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 0 ; MESA: kernarg_segment_byte_size = 16 -; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 { call void @func_implicitarg_ptr() @@ -106,7 +108,9 @@ ; GCN: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 48 ; MESA: kernarg_segment_byte_size = 16 -; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN-NOT: s[4:5] +; GCN-NOT: s4 +; GCN-NOT: s5 ; GCN: s_swappc_b64 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 { call void @func_implicitarg_ptr() @@ -118,10 +122,10 @@ ; HSA: kernarg_segment_byte_size = 112 ; MESA: kernarg_segment_byte_size = 128 -; HSA: s_add_u32 s6, s4, 0x70 -; MESA: s_add_u32 s6, s4, 0x70 +; HSA: s_add_u32 s4, s4, 0x70 +; MESA: s_add_u32 s4, s4, 0x70 -; GCN: s_addc_u32 s7, s5, 0{{$}} +; GCN: s_addc_u32 s5, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 { call void @func_implicitarg_ptr() @@ -133,9 +137,8 @@ ; HSA: kernarg_segment_byte_size = 160 ; MESA: kernarg_segment_byte_size = 128 -; GCN: s_add_u32 s6, s4, 0x70 - -; GCN: s_addc_u32 s7, s5, 0{{$}} +; GCN: s_add_u32 s4, s4, 0x70 +; GCN: s_addc_u32 s5, s5, 0{{$}} ; GCN: s_swappc_b64 define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 { call void @func_implicitarg_ptr() @@ -143,18 +146,18 @@ } ; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func: -; GCN-NOT: s6 -; GCN-NOT: s7 -; GCN-NOT: s[6:7] +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN-NOT: s[4:5] define void @func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void } ; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func: -; GCN-NOT: s6 -; GCN-NOT: s7 -; GCN-NOT: s[6:7] +; GCN-NOT: s4 +; GCN-NOT: s5 +; GCN-NOT: s[4:5] define void @opencl_func_call_implicitarg_ptr_func() #0 { call void @func_implicitarg_ptr() ret void @@ -162,19 +165,19 @@ ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; MESA-DAG: v_mov_b32_e32 v0, s6 -; MESA-DAG: v_mov_b32_e32 v1, s7 -; MESA-DAG: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; MESA: v_mov_b32_e32 v0, s8 -; MESA: v_mov_b32_e32 v1, s9 -; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; MESA-DAG: v_mov_b32_e32 v0, s4 +; MESA-DAG: v_mov_b32_e32 v1, s5 +; MESA-DAG: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; MESA: v_mov_b32_e32 v0, s6 +; MESA: v_mov_b32_e32 v1, s7 +; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; HSA: v_mov_b32_e32 v0, s4 +; HSA: v_mov_b32_e32 v1, s5 +; HSA: flat_load_dword v0, v[0:1] ; HSA: v_mov_b32_e32 v0, s6 ; HSA: v_mov_b32_e32 v1, s7 ; HSA: flat_load_dword v0, v[0:1] -; HSA: v_mov_b32_e32 v0, s8 -; HSA: v_mov_b32_e32 v1, s9 -; HSA: flat_load_dword v0, v[0:1] ; GCN: s_waitcnt vmcnt(0) define void @func_kernarg_implicitarg_ptr() #0 { @@ -189,20 +192,20 @@ ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt +; MESA-DAG: v_mov_b32_e32 v0, s4 +; MESA-DAG: v_mov_b32_e32 v1, s5 +; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; MESA-DAG: 
v_mov_b32_e32 v0, s6 ; MESA-DAG: v_mov_b32_e32 v1, s7 -; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; MESA-DAG: v_mov_b32_e32 v0, s8 -; MESA-DAG: v_mov_b32_e32 v1, s9 -; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: v_mov_b32_e32 v0, s6 -; HSA: v_mov_b32_e32 v1, s7 +; HSA: v_mov_b32_e32 v0, s4 +; HSA: v_mov_b32_e32 v1, s5 ; HSA: flat_load_dword v0, v[0:1] -; HSA: v_mov_b32_e32 v0, s8 -; HSA: v_mov_b32_e32 v1, s9 +; HSA: v_mov_b32_e32 v0, s6 +; HSA: v_mov_b32_e32 v1, s7 ; HSA: flat_load_dword v0, v[0:1] ; GCN: s_waitcnt vmcnt(0) @@ -217,10 +220,8 @@ } ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: -; GCN: s_add_u32 s8, s4, 0x70 -; GCN: s_addc_u32 s9, s5, 0 - -; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_add_u32 s6, s4, 0x70 +; GCN: s_addc_u32 s7, s5, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { call void @func_kernarg_implicitarg_ptr() Index: test/CodeGen/AMDGPU/mad_64_32.ll =================================================================== --- test/CodeGen/AMDGPU/mad_64_32.ll +++ test/CodeGen/AMDGPU/mad_64_32.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s ; GCN-LABEL: {{^}}mad_i64_i32_sextops: -; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3] ; SI: v_mul_lo_u32 ; SI: v_mul_hi_i32 @@ -17,7 +17,7 @@ } ; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute: -; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3] ; SI-DAG: v_mul_lo_u32 ; SI-DAG: v_mul_hi_i32 @@ -32,7 +32,7 @@ } ; GCN-LABEL: {{^}}mad_u64_u32_zextops: -; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] +; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3] ; SI-DAG: v_mul_lo_u32 ; SI-DAG: v_mul_hi_u32 @@ -47,7 +47,7 @@ } ; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute: -; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] +; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3] ; SI-DAG: v_mul_lo_u32 ; SI-DAG: v_mul_hi_u32 @@ -85,7 +85,7 @@ ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63: ; CI: v_lshl_b64 ; CI: v_ashr -; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3] ; SI-NOT: v_mad_u64_u32 define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 { @@ -101,7 +101,7 @@ ; CI: v_bfe_i32 v[[B1:[0-9]+]], v1, 0, 31 ; CI: v_ashr_i64 ; CI: v_bfe_i32 v[[B2:[0-9]+]], v0, 0, 31 -; CI: v_mad_i64_i32 v[0:1], s[6:7], v[[B2]], v[[B1]], v[1:2] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v[1:2] define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { %sext0 = sext i31 %arg0 to i63 %sext1 = sext i31 %arg1 to i63 @@ -111,7 +111,7 @@ } ; GCN-LABEL: {{^}}mad_u64_u32_bitops: -; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5] +; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v2, v[4:5] define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 4294967295 @@ -141,7 +141,7 @@ } ; GCN-LABEL: {{^}}mad_i64_i32_bitops: -; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v2, v[4:5] ; SI-NOT: v_mad_ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { %shl.lhs = shl i64 %arg0, 32 @@ -155,7 +155,7 @@ 
; Example from bug report ; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops: -; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1] +; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v1, v0, v[0:1] ; SI-NOT: v_mad_u64_u32 define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { %tmp4 = lshr i64 %arg0, 32 Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- test/CodeGen/AMDGPU/nested-calls.ll +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -9,28 +9,29 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm: ; GCN: s_waitcnt -; GCN: s_mov_b32 s5, s32 -; GCN-DAG: s_add_u32 s32, s32, 0x400 + ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] - -; GCN-DAG: v_writelane_b32 v32, s34, 0 -; GCN-DAG: v_writelane_b32 v32, s35, 1 -; GCN-DAG: v_writelane_b32 v32, s36, 2 +; GCN-DAG: v_writelane_b32 v32, s34, 2 +; GCN-DAG: s_mov_b32 s34, s32 +; GCN-DAG: s_add_u32 s32, s32, 0x400 +; GCN-DAG: v_writelane_b32 v32, s36, 0 +; GCN-DAG: v_writelane_b32 v32, s37, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s36, v32, 2 -; GCN: v_readlane_b32 s35, v32, 1 -; GCN: v_readlane_b32 s34, v32, 0 +; GCN: v_readlane_b32 s37, v32, 1 +; GCN: v_readlane_b32 s36, v32, 0 + +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: v_readlane_b32 s34, v32, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] - -; GCN: s_sub_u32 s32, s32, 0x400 -; GCN: s_setpc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm() #0 { call void @external_void_func_i32(i32 42) ret void @@ -38,9 +39,9 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use: ; GCN: s_waitcnt -; GCN: s_mov_b32 s5, s32 +; GCN: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset: ; GCN: s_swappc_b64 ; GCN: s_sub_u32 s32, s32, 0x1400{{$}} ; GCN: s_setpc_b64 Index: test/CodeGen/AMDGPU/shift-i128.ll =================================================================== --- test/CodeGen/AMDGPU/shift-i128.ll +++ test/CodeGen/AMDGPU/shift-i128.ll @@ -8,7 +8,7 @@ ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4 ; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4 ; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GCN-NEXT: v_or_b32_e32 v7, v5, v7 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4 ; GCN-NEXT: v_or_b32_e32 v8, v6, v8 @@ -17,8 +17,8 @@ ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 ; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -33,7 +33,7 @@ ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4 ; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4 ; 
GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v7 -; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4 +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GCN-NEXT: v_or_b32_e32 v7, v5, v7 ; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4 ; GCN-NEXT: v_or_b32_e32 v8, v6, v8 @@ -42,8 +42,8 @@ ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -64,13 +64,13 @@ ; GCN-NEXT: v_subrev_i32_e32 v10, vcc, 64, v4 ; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v4 ; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v10 -; GCN-NEXT: v_cmp_gt_u32_e64 s[6:7], 64, v4 +; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 ; GCN-NEXT: v_or_b32_e32 v7, v7, v9 -; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v2, v5 @@ -131,11 +131,11 @@ ; GCN-NEXT: v_lshl_b64 v[4:5], 17, v3 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[6:7] +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc ; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -147,17 +147,17 @@ ; GCN-LABEL: v_lshr_i128_kv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, 0 -; GCN-NEXT: s_movk_i32 s6, 0x41 -; GCN-NEXT: v_lshr_b64 v[2:3], s[6:7], v0 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_movk_i32 s4, 0x41 +; GCN-NEXT: v_lshr_b64 v[2:3], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0x41 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = lshr i128 65, %rhs @@ -170,12 +170,12 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[2:3], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v0, 
33, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v0, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr i128 33, %rhs @@ -214,42 +214,6 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; GCN-NEXT: .section .rodata,#alloc -; GCN-NEXT: .p2align 6 -; GCN-NEXT: .amdhsa_kernel s_shl_i128_ss -; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 -; GCN-NEXT: .amdhsa_next_free_vgpr 8 -; GCN-NEXT: .amdhsa_next_free_sgpr 12 -; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 -; GCN-NEXT: .amdhsa_float_round_mode_32 0 -; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 -; GCN-NEXT: .amdhsa_dx10_clamp 1 -; GCN-NEXT: .amdhsa_ieee_mode 1 -; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 -; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 -; GCN-NEXT: .amdhsa_exception_int_div_zero 0 -; GCN-NEXT: .end_amdhsa_kernel -; GCN-NEXT: .text %shift = shl i128 %lhs, %rhs store i128 %shift, i128 addrspace(1)* null ret void @@ -287,42 +251,6 @@ ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; GCN-NEXT: .section .rodata,#alloc -; GCN-NEXT: .p2align 6 -; GCN-NEXT: .amdhsa_kernel s_lshr_i128_ss -; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 -; GCN-NEXT: .amdhsa_next_free_vgpr 8 -; GCN-NEXT: .amdhsa_next_free_sgpr 12 -; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 -; GCN-NEXT: .amdhsa_float_round_mode_32 0 -; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 -; GCN-NEXT: .amdhsa_dx10_clamp 1 -; GCN-NEXT: .amdhsa_ieee_mode 1 -; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 -; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 -; GCN-NEXT: 
.amdhsa_exception_fp_ieee_div_zero 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 -; GCN-NEXT: .amdhsa_exception_int_div_zero 0 -; GCN-NEXT: .end_amdhsa_kernel -; GCN-NEXT: .text %shift = lshr i128 %lhs, %rhs store i128 %shift, i128 addrspace(1)* null ret void @@ -362,42 +290,6 @@ ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm -; GCN-NEXT: .section .rodata,#alloc -; GCN-NEXT: .p2align 6 -; GCN-NEXT: .amdhsa_kernel s_ashr_i128_ss -; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 -; GCN-NEXT: .amdhsa_next_free_vgpr 8 -; GCN-NEXT: .amdhsa_next_free_sgpr 12 -; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 -; GCN-NEXT: .amdhsa_float_round_mode_32 0 -; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 -; GCN-NEXT: .amdhsa_dx10_clamp 1 -; GCN-NEXT: .amdhsa_ieee_mode 1 -; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 -; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 -; GCN-NEXT: .amdhsa_exception_int_div_zero 0 -; GCN-NEXT: .end_amdhsa_kernel -; GCN-NEXT: .text %shift = ashr i128 %lhs, %rhs store i128 %shift, i128 addrspace(1)* null ret void @@ -410,41 +302,41 @@ ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v18 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] -; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_or_b32_e32 v19, v17, v19 ; GCN-NEXT: v_or_b32_e32 v18, v16, v18 ; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 -; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12 ; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v12 ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v11 -; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] ; GCN-NEXT: v_or_b32_e32 v16, v9, v16 -; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15] 
+; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v10, v17 ; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 -; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9] +; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc ; GCN-NEXT: v_or_b32_e32 v11, v13, v15 ; GCN-NEXT: v_or_b32_e32 v10, v12, v14 ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc -; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -459,41 +351,41 @@ ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] -; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_or_b32_e32 v19, v17, v19 ; GCN-NEXT: v_or_b32_e32 v18, v16, v18 ; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12 ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11 -; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] ; GCN-NEXT: v_or_b32_e32 v16, v9, v16 -; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v10, v17 ; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9 -; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9] +; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc ; GCN-NEXT: v_or_b32_e32 v11, v13, v15 ; GCN-NEXT: v_or_b32_e32 v10, v12, v14 ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8 ; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9] -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5] ; GCN-NEXT: 
v_cndmask_b32_e32 v7, 0, v7, vcc ; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -508,44 +400,44 @@ ; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] -; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 ; GCN-NEXT: v_or_b32_e32 v19, v17, v19 ; GCN-NEXT: v_or_b32_e32 v18, v16, v18 ; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc ; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12 ; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12 ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11 -; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13] +; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13] ; GCN-NEXT: v_or_b32_e32 v16, v9, v16 -; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15] ; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v10, v17 ; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9 -; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9] +; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc ; GCN-NEXT: v_or_b32_e32 v11, v13, v15 ; GCN-NEXT: v_or_b32_e32 v10, v12, v14 -; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11] ; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] ; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5] ; GCN-NEXT: v_ashr_i64 v[8:9], v[6:7], v12 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v7 ; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc -; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[6:7] ; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = ashr <2 x i128> %lhs, %rhs @@ -617,42 +509,6 @@ ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: s_endpgm -; GCN-NEXT: .section .rodata,#alloc -; GCN-NEXT: .p2align 6 -; GCN-NEXT: .amdhsa_kernel s_shl_v2i128ss -; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCN-NEXT: 
.amdhsa_system_sgpr_workgroup_id_y 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 -; GCN-NEXT: .amdhsa_next_free_vgpr 16 -; GCN-NEXT: .amdhsa_next_free_sgpr 22 -; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 -; GCN-NEXT: .amdhsa_float_round_mode_32 0 -; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 -; GCN-NEXT: .amdhsa_dx10_clamp 1 -; GCN-NEXT: .amdhsa_ieee_mode 1 -; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 -; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 -; GCN-NEXT: .amdhsa_exception_int_div_zero 0 -; GCN-NEXT: .end_amdhsa_kernel -; GCN-NEXT: .text %shift = shl <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -723,42 +579,6 @@ ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: s_endpgm -; GCN-NEXT: .section .rodata,#alloc -; GCN-NEXT: .p2align 6 -; GCN-NEXT: .amdhsa_kernel s_lshr_v2i128_ss -; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 -; GCN-NEXT: .amdhsa_next_free_vgpr 16 -; GCN-NEXT: .amdhsa_next_free_sgpr 22 -; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 -; GCN-NEXT: .amdhsa_float_round_mode_32 0 -; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 -; GCN-NEXT: .amdhsa_dx10_clamp 1 -; GCN-NEXT: .amdhsa_ieee_mode 1 -; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 -; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 -; GCN-NEXT: .amdhsa_exception_int_div_zero 0 -; GCN-NEXT: .end_amdhsa_kernel -; GCN-NEXT: .text %shift = lshr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -833,42 +653,6 @@ ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: s_endpgm -; GCN-NEXT: .section .rodata,#alloc -; GCN-NEXT: .p2align 6 -; GCN-NEXT: .amdhsa_kernel s_ashr_v2i128_ss -; GCN-NEXT: .amdhsa_group_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_private_segment_fixed_size 0 -; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 -; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 -; 
GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0 -; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 -; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 -; GCN-NEXT: .amdhsa_next_free_vgpr 16 -; GCN-NEXT: .amdhsa_next_free_sgpr 23 -; GCN-NEXT: .amdhsa_reserve_flat_scratch 0 -; GCN-NEXT: .amdhsa_float_round_mode_32 0 -; GCN-NEXT: .amdhsa_float_round_mode_16_64 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_32 0 -; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3 -; GCN-NEXT: .amdhsa_dx10_clamp 1 -; GCN-NEXT: .amdhsa_ieee_mode 1 -; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0 -; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0 -; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0 -; GCN-NEXT: .amdhsa_exception_int_div_zero 0 -; GCN-NEXT: .end_amdhsa_kernel -; GCN-NEXT: .text %shift = ashr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -202,36 +202,38 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: -; GCN: s_mov_b32 s5, s32 -; GCN: s_add_u32 s32, s32, 0x400 - ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:8 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec +; GCN: s_mov_b32 s34, s32 +; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s34, 0 -; GCN-DAG: v_writelane_b32 v34, s35, 1 +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v34, s36, 0 +; GCN-DAG: v_writelane_b32 v34, s37, 1 -; GCN-DAG: s_getpc_b64 -; GCN: s_swappc_b64 +; GCN-DAG: s_getpc_b64 s[4:5] +; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 +; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+4 +; GCN-DAG: s_swappc_b64 -; GCN: s_getpc_b64 s[6:7] -; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 -; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4 +; GCN: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+4 -; GCN-DAG: v_readlane_b32 s34, v34, 0 -; GCN-DAG: v_readlane_b32 s35, v34, 1 +; GCN-DAG: v_readlane_b32 s36, v34, 0 +; GCN-DAG: v_readlane_b32 s37, v34, 1 -; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload -; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:8 -; GCN-NEXT: s_mov_b64 exec +; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GCN: 
-; GCN: s_setpc_b64 s[6:7]
+; GCN-NEXT: v_readlane_b32 s34,
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
 entry:
 %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
@@ -249,7 +251,7 @@
 ; GCN-NOT: s33
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
-; GCN: s_setpc_b64 s[6:7]
+; GCN: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
 entry:
 %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -264,7 +266,7 @@
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
 ; GCN-NOT: s33
-; GCN: s_setpc_b64 s[6:7]
+; GCN: s_setpc_b64 s[4:5]
 define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
 entry:
 %alloca = alloca [16 x i32], align 4, addrspace(5)
Index: test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
+++ test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
@@ -1,16 +1,19 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
-; For the CSR copy of s5, it may be possible to see it in
-; storeRegToStackSlot.
-
 ; GCN-LABEL: {{^}}spill_csr_s5_copy:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
-; GCN: v_writelane_b32 v32, s5, 2
+; GCN: s_or_saveexec_b64
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec
+; GCN: v_writelane_b32 v32, s34, 2
 ; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s5, v32, 2
+
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
+
+; GCN: v_readlane_b32 s34, v32, 2
+; GCN: s_or_saveexec_b64
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN: s_mov_b64 exec
 ; GCN: s_setpc_b64
 define void @spill_csr_s5_copy() #0 {
 bb:
Index: test/CodeGen/AMDGPU/spill-offset-calculation.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -292,5 +292,5 @@
 }
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
-attributes #2 = { nounwind "amdgpu-num-sgpr"="15" "amdgpu-num-vgpr"="8" }
+attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
Index: test/CodeGen/AMDGPU/stack-realign.ll
===================================================================
--- test/CodeGen/AMDGPU/stack-realign.ll
+++ test/CodeGen/AMDGPU/stack-realign.ll
@@ -34,7 +34,7 @@
 ; GCN-LABEL: {{^}}needs_align16_stack_align4:
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
-; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffffc00
+; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00
 ; GCN: s_add_u32 s32, s32, 0x2800{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
@@ -55,7 +55,7 @@
 ; GCN-LABEL: {{^}}needs_align32:
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
-; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffff800
+; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800
 ; GCN: s_add_u32 s32, s32, 0x3000{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
@@ -76,7 +76,7 @@
 ; GCN-LABEL: {{^}}force_realign4:
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
-; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xffffff00
+; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00
 ; GCN: s_add_u32 s32, s32, 0xd00{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
@@ -129,11 +129,13 @@
 ; GCN-LABEL: {{^}}default_realign_align128:
 ; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
-; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_and_b32 s34, [[TMP]], 0xffffe000
 ; GCN-NEXT: s_add_u32 s32, s32, 0x4000
-; GCN-NOT: s5
-; GCN: buffer_store_dword v0, off, s[0:3], s5{{$}}
+; GCN-NOT: s34
+; GCN: buffer_store_dword v0, off, s[0:3], s34{{$}}
 ; GCN: s_sub_u32 s32, s32, 0x4000
+; GCN: s_mov_b32 s34, [[FP_COPY]]
 define void @default_realign_align128(i32 %idx) #0 {
 %alloca.align = alloca i32, align 128, addrspace(5)
 store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
Index: test/CodeGen/AMDGPU/wave32.ll
===================================================================
--- test/CodeGen/AMDGPU/wave32.ll
+++ test/CodeGen/AMDGPU/wave32.ll
@@ -1068,35 +1068,37 @@
 ; GCN-LABEL: {{^}}callee_no_stack_with_call:
 ; GCN: s_waitcnt
-; GCN: s_mov_b32 s5, s32
-; GFX1064: s_add_u32 s32, s32, 0x400
-; GFX1032: s_add_u32 s32, s32, 0x200
-
-; GFX1064: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GFX1032: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
-
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt_vscnt
+; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
-; GCN-DAG: v_writelane_b32 v32, s34, 0
-; GCN-DAG: v_writelane_b32 v32, s35, 1
-; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN: s_mov_b32 s34, s32
+; GFX1064: s_add_u32 s32, s32, 0x400
+; GFX1032: s_add_u32 s32, s32, 0x200
+
+
+; GCN-DAG: v_writelane_b32 v32, s36, 0
+; GCN-DAG: v_writelane_b32 v32, s37, 1
 ; GCN: s_swappc_b64
-; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
-; GCN-DAG: v_readlane_b32 s35, v32, 1
-; GCN-DAG: v_readlane_b32 s34, v32, 0
+; GCN-DAG: v_readlane_b32 s36, v32, 0
+; GCN-DAG: v_readlane_b32 s37, v32, 1
+
+; GFX1064: s_sub_u32 s32, s32, 0x400
+; GFX1032: s_sub_u32 s32, s32, 0x200
+; GCN: v_readlane_b32 s34, v32, 2
 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
-
-; GFX1064: s_sub_u32 s32, s32, 0x400
-; GFX1032: s_sub_u32 s32, s32, 0x200
-; GCN: s_setpc_b64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
 define void @callee_no_stack_with_call() #1 {
 call void @external_void_func_void()
 ret void