diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6701,16 +6701,13 @@ 1. SGPR33 is used as a frame pointer (FP) if necessary. Like the SP it is an unswizzled scratch address. It is only needed if runtime sized ``alloca`` are used, or for the reasons defined in ``SIFrameLowering``. -2. Runtime stack alignment is not currently supported. +2. Runtime stack alignment is supported. SGPR34 is used as a base pointer (BP) + to access the incoming stack arguments in the function. The BP is needed + only when the function requires the runtime stack alignment. - .. TODO:: - - - If runtime stack alignment is supported, then will an extra argument - pointer register be used? - -2. Allocating SGPR arguments on the stack are not supported. +3. Allocating SGPR arguments on the stack are not supported. -3. No CFI is currently generated. See +4. No CFI is currently generated. See :ref:`amdgpu-dwarf-call-frame-information`. ..note:: @@ -6729,12 +6726,12 @@ local variables and register spill slots are accessed as positive offsets relative to ``DW_AT_frame_base``. -4. Function argument passing is implemented by copying the input physical +5. Function argument passing is implemented by copying the input physical registers to virtual registers on entry. The register allocator can spill if necessary. These are copied back to physical registers at call sites. The net effect is that each function call can have these values in entirely distinct locations. The IPRA can help avoid shuffling argument registers. -5. Call sites are implemented by setting up the arguments at positive offsets +6. Call sites are implemented by setting up the arguments at positive offsets from SP. Then SP is incremented to account for the known frame size before the call and decremented after the call. @@ -6743,7 +6740,7 @@ The CFI will reflect the changed calculation needed to compute the CFA from SP. -6. 
4 byte spill slots are used in the stack frame. One slot is allocated for an +7. 4 byte spill slots are used in the stack frame. One slot is allocated for an emergency spill slot. Buffer instructions are used for stack accesses and not the ``flat_scratch`` instruction. diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -73,11 +73,56 @@ return MCRegister(); } -static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) { - LivePhysRegs LiveRegs; - LiveRegs.init(*MRI.getTargetRegisterInfo()); - return findScratchNonCalleeSaveRegister( - MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); +static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, + LivePhysRegs &LiveRegs, + Register &TempSGPR, + Optional<int> &FrameIndex, + bool IsFP) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + +#ifndef NDEBUG + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); +#endif + + if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { + int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, + TargetStackID::SGPRSpill); + + // If there is already a VGPR with free lanes, use it. We may already have + // to pay the penalty for spilling a CSR VGPR. + if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + + FrameIndex = NewFI; + + LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n'); + return; + } + + TempSGPR = findScratchNonCalleeSaveRegister( + MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true); + + if (!TempSGPR) { + // There's no free lane to spill, and no free register to save FP/BP, + // so we're forced to spill another VGPR to use for the spill. 
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, + TargetStackID::SGPRSpill); + if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) + llvm_unreachable("allocate SGPR spill should have worked"); + FrameIndex = NewFI; + + LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); + dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n';); + } else { + LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to " + << printReg(TempSGPR, TRI) << '\n'); + } } // We need to specially emit stack operations here because a different frame @@ -597,12 +642,15 @@ Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + Register BasePtrReg = + TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); LivePhysRegs LiveRegs; MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; bool HasFP = false; + bool HasBP = false; uint32_t NumBytes = MFI.getStackSize(); uint32_t RoundedSize = NumBytes; // To avoid clobbering VGPRs in lanes that weren't active on function entry, @@ -614,9 +662,32 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy) .addReg(FramePtrReg) .setMIFlag(MachineInstr::FrameSetup); - // Make the register live throughout the function. - for (MachineBasicBlock &MBB : MF) - MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy); + } + + // Emit the copy if we need a BP, and are using a free SGPR to save it. + if (FuncInfo->SGPRForBPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), + FuncInfo->SGPRForBPSaveRestoreCopy) + .addReg(BasePtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + + // If a copy has been emitted for FP and/or BP, Make the SGPRs + // used in the copy instructions live throughout the function. 
+ SmallVector<MCPhysReg, 2> TempSGPRs; + if (FuncInfo->SGPRForFPSaveRestoreCopy) + TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy); + + if (FuncInfo->SGPRForBPSaveRestoreCopy) + TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy); + + if (!TempSGPRs.empty()) { + for (MachineBasicBlock &MBB : MF) { + for (MCPhysReg Reg : TempSGPRs) + MBB.addLiveIn(Reg); + + MBB.sortUniqueLiveIns(); + } } for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg @@ -630,12 +701,16 @@ LiveRegs.addLiveIns(MBB); if (FuncInfo->SGPRForFPSaveRestoreCopy) LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy); + + if (FuncInfo->SGPRForBPSaveRestoreCopy) + LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy); } ScratchExecCopy = findScratchNonCalleeSaveRegister(MRI, LiveRegs, *TRI.getWaveMaskRegClass()); - assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy); + assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy && + FuncInfo->SGPRForBPSaveRestoreCopy != ScratchExecCopy); const unsigned OrSaveExec = ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; @@ -677,6 +752,23 @@ .addReg(Spill[0].VGPR, RegState::Undef); } + if (FuncInfo->BasePointerSaveIndex) { + const int BasePtrFI = FuncInfo->BasePointerSaveIndex.getValue(); + assert(!MFI.isDeadObjectIndex(BasePtrFI) && + MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); + ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill = + FuncInfo->getSGPRToVGPRSpills(BasePtrFI); + assert(Spill.size() == 1); + + // Save BP before setting it up. 
+ // FIXME: This should respect spillSGPRToVGPR; + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill[0].VGPR) + .addReg(BasePtrReg) + .addImm(Spill[0].Lane) + .addReg(Spill[0].VGPR, RegState::Undef); + } + if (TRI.needsStackRealignment(MF)) { HasFP = true; const unsigned Alignment = MFI.getMaxAlign().value(); @@ -686,12 +778,14 @@ LiveRegs.init(TRI); LiveRegs.addLiveIns(MBB); LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy); + LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy); } Register ScratchSPReg = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass); assert(ScratchSPReg != AMDGPU::NoRegister && - ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy); + ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy && + ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy); // s_add_u32 tmp_reg, s32, NumBytes // s_and_b32 s32, tmp_reg, 0b111...0000 @@ -705,15 +799,21 @@ .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); } else if ((HasFP = hasFP(MF))) { - // If we need a base pointer, set it up here. It's whatever the value of - // the stack pointer is at this point. Any variable size objects will be - // allocated after this, so we can still use the base pointer to reference - // locals. BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg) .addReg(StackPtrReg) .setMIFlag(MachineInstr::FrameSetup); } + // If we need a base pointer, set it up here. It's whatever the value of + // the stack pointer is at this point. Any variable size objects will be + // allocated after this, so we can still use the base pointer to reference + // locals. 
+ if ((HasBP = TRI.hasBasePointer(MF))) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), TRI.getBaseRegister()) + .addReg(StackPtrReg) + .setMIFlag(MachineInstr::FrameSetup); + } + if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) @@ -728,6 +828,14 @@ assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy && !FuncInfo->FramePointerSaveIndex)) && "Saved FP but didn't need it"); + + assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy || + FuncInfo->BasePointerSaveIndex)) && + "Needed to save BP but didn't save it anywhere"); + + assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy && + !FuncInfo->BasePointerSaveIndex)) && + "Saved BP but didn't need it"); } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -739,6 +847,7 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); LivePhysRegs LiveRegs; DebugLoc DL; @@ -763,6 +872,12 @@ .setMIFlag(MachineInstr::FrameSetup); } + if (FuncInfo->SGPRForBPSaveRestoreCopy) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), TRI.getBaseRegister()) + .addReg(FuncInfo->SGPRForBPSaveRestoreCopy) + .setMIFlag(MachineInstr::FrameSetup); + } + if (FuncInfo->FramePointerSaveIndex) { const int FI = FuncInfo->FramePointerSaveIndex.getValue(); @@ -778,13 +893,27 @@ .addImm(Spill[0].Lane); } + if (FuncInfo->BasePointerSaveIndex) { + const int BasePtrFI = FuncInfo->BasePointerSaveIndex.getValue(); + + assert(!MF.getFrameInfo().isDeadObjectIndex(BasePtrFI) && + MF.getFrameInfo().getStackID(BasePtrFI) == TargetStackID::SGPRSpill); + + ArrayRef Spill = + FuncInfo->getSGPRToVGPRSpills(BasePtrFI); + assert(Spill.size() == 1); + BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + TRI.getBaseRegister()) + .addReg(Spill[0].VGPR) + .addImm(Spill[0].Lane); 
+ } + Register ScratchExecCopy; for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { if (!Reg.FI.hasValue()) continue; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); if (ScratchExecCopy == AMDGPU::NoRegister) { // See emitPrologue if (LiveRegs.empty()) { @@ -832,12 +961,14 @@ #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI, - Optional FramePointerSaveIndex) { + Optional FramePointerSaveIndex, + Optional BasePointerSaveIndex) { for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && MFI.getStackID(I) == TargetStackID::SGPRSpill && - FramePointerSaveIndex && I != FramePointerSaveIndex) { + ((FramePointerSaveIndex && I != FramePointerSaveIndex) || + (BasePointerSaveIndex && I != BasePointerSaveIndex))) { return false; } } @@ -864,7 +995,7 @@ SIMachineFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->removeDeadFrameIndices(MFI); - assert(allSGPRSpillsAreDead(MFI, None) && + assert(allSGPRSpillsAreDead(MFI, None, None) && "SGPR spill should have been removed in SILowerSGPRSpills"); // FIXME: The other checks should be redundant with allStackObjectsAreDead, @@ -920,46 +1051,19 @@ for (auto SSpill : MFI->getSGPRSpillVGPRs()) SavedVGPRs.reset(SSpill.VGPR); - const bool HasFP = WillHaveFP || hasFP(MF); - if (!HasFP) - return; - - if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) { - int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, - TargetStackID::SGPRSpill); - - // If there is already a VGPR with free lanes, use it. We may already have - // to pay the penalty for spilling a CSR VGPR. 
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) - llvm_unreachable("allocate SGPR spill should have worked"); - - MFI->FramePointerSaveIndex = NewFI; + LivePhysRegs LiveRegs; + LiveRegs.init(*TRI); - LLVM_DEBUG( - auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI) - << ':' << Spill.Lane << '\n'); - return; + if (WillHaveFP || hasFP(MF)) { + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy, + MFI->FramePointerSaveIndex, true); } - MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo()); - - if (!MFI->SGPRForFPSaveRestoreCopy) { - // There's no free lane to spill, and no free register to save FP, so we're - // forced to spill another VGPR to use for the spill. - int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr, - TargetStackID::SGPRSpill); - if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI)) - llvm_unreachable("allocate SGPR spill should have worked"); - MFI->FramePointerSaveIndex = NewFI; - - LLVM_DEBUG( - auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front(); - dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI) - << ':' << Spill.Lane << '\n';); - } else { - LLVM_DEBUG(dbgs() << "Saving FP with copy to " << - printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n'); + if (TRI->hasBasePointer(MF)) { + if (MFI->SGPRForFPSaveRestoreCopy) + LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy, + MFI->BasePointerSaveIndex, false); } } @@ -986,14 +1090,31 @@ return true; // Early exit if no callee saved registers are modified! 
const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - if (!FuncInfo->SGPRForFPSaveRestoreCopy) + if (!FuncInfo->SGPRForFPSaveRestoreCopy && + !FuncInfo->SGPRForBPSaveRestoreCopy) return false; + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); + Register FramePtrReg = FuncInfo->getFrameOffsetReg(); + Register BasePtrReg = RI->getBaseRegister(); + unsigned NumModifiedRegs = 0; + + if (FuncInfo->SGPRForFPSaveRestoreCopy) + NumModifiedRegs++; + if (FuncInfo->SGPRForBPSaveRestoreCopy) + NumModifiedRegs++; + for (auto &CS : CSI) { - if (CS.getReg() == FuncInfo->getFrameOffsetReg()) { - if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) - CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); - break; + if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) { + CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy); + if (--NumModifiedRegs) + break; + } else if (CS.getReg() == BasePtrReg && + FuncInfo->SGPRForBPSaveRestoreCopy) { + CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy); + if (--NumModifiedRegs) + break; } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -485,6 +485,11 @@ Register SGPRForFPSaveRestoreCopy; Optional FramePointerSaveIndex; + /// If this is set, an SGPR used for save/restore of the register used for the + /// base pointer. 
+ Register SGPRForBPSaveRestoreCopy; + Optional<int> BasePointerSaveIndex; + public: SIMachineFunctionInfo(const MachineFunction &MF); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -411,9 +411,9 @@ } void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { - // The FP spill hasn't been inserted yet, so keep it around. + // The FP & BP spills haven't been inserted yet, so keep them around. for (auto &R : SGPRToVGPRSpills) { - if (R.first != FramePointerSaveIndex) + if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) MFI.RemoveStackObject(R.first); } @@ -421,7 +421,7 @@ // ID. for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; ++i) - if (i != FramePointerSaveIndex) + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -65,6 +65,9 @@ Register getFrameRegister(const MachineFunction &MF) const override; + bool hasBasePointer(const MachineFunction &MF) const; + Register getBaseRegister() const; + bool canRealignStack(const MachineFunction &MF) const override; bool requiresRegisterScavenging(const MachineFunction &Fn) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -112,6 +112,14 @@ : FuncInfo->getStackPtrOffsetReg(); } +bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { + // When we need stack realignment, we can't reference off of the + // stack pointer, so we reserve a base 
pointer. + return needsStackRealignment(MF); +} + +Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } + const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { return CSR_AMDGPU_AllVGPRs_RegMask; } @@ -297,6 +305,12 @@ assert(!isSubRegister(ScratchRSrcReg, FrameReg)); } + if (hasBasePointer(MF)) { + MCRegister BasePtrReg = getBaseRegister(); + reserveRegisterTuples(Reserved, BasePtrReg); + assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); + } + for (MCRegister Reg : MFI->WWMReservedRegs) { reserveRegisterTuples(Reserved, Reg); } @@ -1039,7 +1053,9 @@ MachineOperand &FIOp = MI->getOperand(FIOperandNum); int Index = MI->getOperand(FIOperandNum).getIndex(); - Register FrameReg = getFrameRegister(*MF); + Register FrameReg = hasBasePointer(*MF) && FrameInfo.isFixedObjectIndex(Index) + ? getBaseRegister() + : getFrameRegister(*MF); switch (MI->getOpcode()) { // SGPR register spill diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -298,11 +298,14 @@ ; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x100000 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 ; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 ; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_mov_b32 s34, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll @@ -10,17 +10,17 @@ define i32 
@fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 { ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr ; GCN: bb.0.begin: - ; GCN: liveins: $sgpr30_sgpr31, $sgpr7 + ; GCN: liveins: $sgpr7, $sgpr30_sgpr31 ; GCN: $sgpr7 = frame-setup COPY $sgpr33 ; GCN: $sgpr33 = frame-setup COPY $sgpr32 ; GCN: bb.1.lp_end: - ; GCN: liveins: $sgpr6, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31, $sgpr7 + ; GCN: liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31 ; GCN: bb.2.lp_begin: - ; GCN: liveins: $sgpr6, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31, $sgpr7 + ; GCN: liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31 ; GCN: bb.3.Flow: - ; GCN: liveins: $sgpr6, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31, $sgpr7 + ; GCN: liveins: $sgpr6, $sgpr7, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31 ; GCN: bb.4.end: - ; GCN: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31, $sgpr7 + ; GCN: liveins: $sgpr7, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31 ; GCN: $sgpr33 = frame-setup COPY $sgpr7 begin: br label %lp_begin diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -27,20 +27,23 @@ liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; CHECK: liveins: $vgpr1 + ; CHECK: liveins: $sgpr27, $vgpr1, $vgpr2 ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $vgpr2 = V_WRITELANE_B32_vi $sgpr34, 0, undef $vgpr2 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, 
implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK: $vgpr2 = COPY killed $sgpr33 + ; CHECK: $vgpr3 = COPY killed $sgpr33 ; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc ; CHECK: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, 
implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = V_READLANE_B32_vi $vgpr2, 0 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 @@ -69,18 +72,23 @@ liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr - ; CHECK: liveins: $vgpr1 + ; CHECK: liveins: $sgpr27, $sgpr29, $vgpr1 ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr29 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup COPY $sgpr32 ; 
CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc - ; CHECK: $vgpr2 = COPY killed $sgpr29 + ; CHECK: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc + ; CHECK: $vgpr2 = COPY killed $sgpr33 + ; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc + ; CHECK: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr29 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def 
$sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 @@ -109,18 +117,21 @@ liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64 - ; CHECK: liveins: $vgpr1 + ; CHECK: liveins: $sgpr27, $sgpr28, $vgpr1 ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr28 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, 
implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $sgpr28 = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr2, 0, implicit $exec + ; CHECK: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc + ; CHECK: $vgpr2 = COPY killed $sgpr29 ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr28 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit 
$sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 @@ -148,10 +159,12 @@ liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc - ; CHECK: liveins: $vgpr1 + ; CHECK: liveins: $sgpr27, $sgpr28, $vgpr1 ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr28 = frame-setup COPY $sgpr34 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -160,6 +173,7 @@ ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit 
$sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = frame-setup COPY $sgpr28 ; CHECK: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -22,17 +22,20 @@ liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; CHECK: liveins: $vgpr1 + ; CHECK: liveins: $sgpr27, $vgpr1, $vgpr2 ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $vgpr2 = V_WRITELANE_B32_vi $sgpr34, 0, undef $vgpr2 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 
4294443008, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec - ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, 
implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = V_READLANE_B32_vi $vgpr2, 0 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -22,16 +22,19 @@ liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei - ; CHECK: liveins: $vgpr1 + ; CHECK: liveins: $sgpr27, $vgpr1, $vgpr2 ; CHECK: $sgpr27 = 
frame-setup COPY $sgpr33 + ; CHECK: $vgpr2 = V_WRITELANE_B32_gfx6_gfx7 $sgpr34, 0, undef $vgpr2 ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 262080, implicit-def $scc ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc + ; CHECK: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit 
$sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: $sgpr34 = V_READLANE_B32_gfx6_gfx7 $vgpr2, 0 ; CHECK: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.0, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -25,35 +25,41 @@ ; GFX8-LABEL: name: 
pei_scavenge_vgpr_spill ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, 
$vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX8: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX8: $vgpr2 = V_WRITELANE_B32_vi $sgpr34, 1, undef $vgpr2 ; GFX8: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX8: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc - ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8: $vcc_lo = S_MOV_B32 8192 ; GFX8: $vgpr3, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; GFX8: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec ; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; GFX8: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX8: $sgpr34 = V_READLANE_B32_vi $vgpr2, 1 ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc - ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs ; GFX9-LABEL: name: pei_scavenge_vgpr_spill ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX9: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX9: $vgpr2 = V_WRITELANE_B32_vi $sgpr34, 1, undef $vgpr2 ; GFX9: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc ; GFX9: $sgpr33 = frame-setup 
S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX9: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc - ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec ; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; GFX9: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX9: $sgpr34 = V_READLANE_B32_vi $vgpr2, 1 ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc - ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec S_ENDPGM 0, csr_amdgpu_allvgprs diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -37,12 +37,15 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 +; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 s32, s32, 0x2800{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 
offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0x2800 +; GCN: s_mov_b32 s34, [[BP_COPY]] ; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { @@ -58,12 +61,15 @@ ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 +; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 s32, s32, 0x3000{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0x3000 +; GCN: s_mov_b32 s34, [[BP_COPY]] ; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { @@ -76,10 +82,13 @@ ; GCN-LABEL: {{^}}force_realign4: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}} ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00 +; GCN: s_mov_b32 [[BP_COPY]], s34 +; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 s32, s32, 0xd00{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0xd00 +; GCN: s_mov_b32 s34, [[BP_COPY]] ; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { @@ -127,11 +136,14 @@ ; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0 ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_and_b32 s33, [[TMP]], 0xffffe000 +; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x4000 ; GCN-NOT: s33 ; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} ; GCN: s_sub_u32 s32, s32, 0x4000 ; GCN: s_mov_b32 s33, [[FP_COPY]] +; GCN: s_mov_b32 s34, [[BP_COPY]] define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128 @@ -148,7 +160,118 @@ ret void } +declare void @extern_func(<32 x i32>, i32) #0 +define void 
@func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { +; The test forces the stack to be realigned to a new boundary +; since there is a local object with an alignment of 1024. +; Should use BP to access the incoming stack arguments. +; The BP value is saved/restored with a VGPR spill. + +; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: +; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 +; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 +; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 +; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill + +; GCN: s_mov_b32 s34, s32 +; GCN-NEXT: v_mov_b32_e32 v32, 0 + +; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 +; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 +; GCN-NEXT: s_add_u32 s32, s32, 0x30000 + +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2 +; GCN-NEXT: s_sub_u32 s32, s32, 0x30000 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] + %temp = alloca i32, align 1024, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %temp, align 1024 + call void @extern_func(<32 x i32> %a, i32 %b) + ret void +} + +%struct.Data = type { [9 x i32] } +define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 8 %arg) local_unnamed_addr #4 { +; The local object allocation needed an alignment of 1024. 
+; Since the function argument is accessed in a loop with an +; index variable, the base pointer first gets loaded into a VGPR +; and that value should be further referenced to load the incoming values. +; The BP value will get saved/restored in an SGPR at the prologue/epilogue. + +; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: +; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 +; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 +; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 +; GCN: s_add_u32 s32, s32, 0x30000 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 +; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen +; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] +; GCN: s_sub_u32 s32, s32, 0x30000 +; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] +; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] +; GCN-NEXT: s_setpc_b64 s[30:31] +begin: + %local_var = alloca i32, align 1024, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %local_var, align 1024 + br label %loop_body + +loop_end: ; preds = %loop_body + %idx_next = add nuw nsw i32 %lp_idx, 1 + %lp_exit_cond = icmp eq i32 %idx_next, 9 + br i1 %lp_exit_cond, label %exit, label %loop_body + +loop_body: ; preds = %loop_end, %begin + %lp_idx = phi i32 [ 0, %begin ], [ %idx_next, %loop_end ] + %ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 %lp_idx + %val = load i32, i32 addrspace(5)* %ptr, align 8 + %lp_cond = icmp eq i32 %val, %lp_idx + br i1 %lp_cond, label %loop_end, label %exit + +exit: ; preds = %loop_end, %loop_body + %out = phi i32 [ 0, %loop_body ], [ 1, %loop_end ] + ret i32 %out +} + +define void @no_free_scratch_sgpr_for_bp_copy(i32 %arg) #0 { +; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy: +; GCN: ; %bb.0: +; GCN: v_writelane_b32
[[VGPR_REG:v[0-9]+]], s34, 3 +; GCN-NEXT: s_add_u32 s4, s32, 0x1fc0 +; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s30, 0 +; GCN-NEXT: s_and_b32 s33, s4, 0xffffe000 +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN: s_mov_b32 s34, s32 +; GCN: s_sub_u32 s32, s32, 0x4000 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + %local_val = alloca i32, align 128, addrspace(5) + store volatile i32 %arg, i32 addrspace(5)* %local_val, align 128 + ; Use all clobberable registers, so BP has to spill to a VGPR. + call void asm sideeffect "", + "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9} + ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19} + ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} + ,~{s30},~{s31}"() #0 + ret void +} + attributes #0 = { noinline nounwind } attributes #1 = { noinline nounwind "stackrealign" } attributes #2 = { noinline nounwind alignstack=4 } attributes #3 = { noinline nounwind "no-realign-stack" } +attributes #4 = { noinline nounwind "frame-pointer"="all"}