diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1320,16 +1320,14 @@
   const BitVector AllSavedRegs = SavedRegs;
   SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

-  // If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
-  const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
-
   // We have to anticipate introducing CSR VGPR spills or spill of caller
   // save VGPR reserved for SGPR spills as we now always create stack entry
-  // for it, if we don't have any stack objects already, since we require
-  // an FP if there is a call and stack.
+  // for it, if we don't have any stack objects already, since we require an
+  // FP if there is a call and stack. We will allocate a VGPR for SGPR spills
+  // if there are any SGPR spills, whether they are CSR spills or otherwise.
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   const bool WillHaveFP =
-      FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill);
+      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

   // FP will be specially managed like SP.
   if (WillHaveFP || hasFP(MF))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -45,10 +45,6 @@
     cl::desc("Do not align and prefetch loops"), cl::init(false));

-static cl::opt<bool> VGPRReserveforSGPRSpill(
-    "amdgpu-reserve-vgpr-for-sgpr-spill",
-    cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
-
 static cl::opt<bool> UseDivergentRegisterIndexing(
     "amdgpu-use-divergent-register-indexing", cl::Hidden,
     cl::desc("Use indirect register addressing for divergent indexes"),
     cl::init(false));
@@ -11990,13 +11986,6 @@
   }

   TargetLoweringBase::finalizeLowering(MF);
-
-  // Allocate a VGPR for future SGPR Spill if
-  // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
-  // FIXME: We won't need this hack if we split SGPR allocation from VGPR
-  if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
-      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
-    Info->reserveVGPRforSGPRSpills(MF);
 }

 void SITargetLowering::computeKnownBitsForFrameIndex(
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -239,50 +239,6 @@
   return false;
 }

-// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
-static bool lowerShiftReservedVGPR(MachineFunction &MF,
-                                   const GCNSubtarget &ST) {
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
-  // Early out if pre-reservation of a VGPR for SGPR spilling is disabled.
-  if (!PreReservedVGPR)
-    return false;
-
-  // If there are no free lower VGPRs available, default to using the
-  // pre-reserved register instead.
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  Register LowestAvailableVGPR =
-      TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF);
-  if (!LowestAvailableVGPR)
-    LowestAvailableVGPR = PreReservedVGPR;
-
-  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  // Create a stack object for a possible spill in the function prologue.
-  // Note Non-CSR VGPR also need this as we may overwrite inactive lanes.
-  Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
-
-  // Find saved info about the pre-reserved register.
-  const auto *ReservedVGPRInfoItr =
-      llvm::find_if(FuncInfo->getSGPRSpillVGPRs(),
-                    [PreReservedVGPR](const auto &SpillRegInfo) {
-                      return SpillRegInfo.VGPR == PreReservedVGPR;
-                    });
-
-  assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end());
-  auto Index =
-      std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr);
-
-  FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
-
-  for (MachineBasicBlock &MBB : MF) {
-    assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
-    MBB.addLiveIn(LowestAvailableVGPR);
-    MBB.sortUniqueLiveIns();
-  }
-
-  return true;
-}
-
 bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -304,11 +260,6 @@
   if (!MFI.hasStackObjects() && !HasCSRs) {
     SaveBlocks.clear();
     RestoreBlocks.clear();
-    if (FuncInfo->VGPRReservedForSGPRSpill) {
-      // Free the reserved VGPR for later possible use by frame lowering.
-      FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
-      MRI.freezeReservedRegs(MF);
-    }
     return false;
   }

@@ -326,8 +277,6 @@
   // This operates under the assumption that only other SGPR spills are users
   // of the frame index.

-  lowerShiftReservedVGPR(MF, ST);
-
   // To track the spill frame indices handled in this pass.
   BitVector SpillFIs(MFI.getObjectIndexEnd(), false);

@@ -375,8 +324,6 @@
     FuncInfo->removeDeadFrameIndices(MFI);
     MadeChange = true;
-  } else if (FuncInfo->VGPRReservedForSGPRSpill) {
-    FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
   }

   SaveBlocks.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -502,7 +502,6 @@
   Register SGPRForBPSaveRestoreCopy;
   Optional<int> BasePointerSaveIndex;
-  Register VGPRReservedForSGPRSpill;
   bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);

 public:
@@ -528,7 +527,6 @@
   void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
     SpillVGPRs[Index].VGPR = NewVGPR;
     SpillVGPRs[Index].FI = newFI;
-    VGPRReservedForSGPRSpill = NewVGPR;
   }

   bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
@@ -556,7 +554,6 @@
   bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
                                  unsigned NumLane) const;
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
-  bool reserveVGPRforSGPRSpills(MachineFunction &MF);
   bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);

   void removeDeadFrameIndices(MachineFrameInfo &MFI);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -274,7 +274,6 @@
   MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   unsigned WaveSize = ST.getWavefrontSize();
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

   unsigned Size = FrameInfo.getObjectSize(FI);
   unsigned NumLanes = Size / 4;
@@ -291,16 +290,7 @@
     Register LaneVGPR;
     unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);

-    // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and
-    // when one of the two conditions is true:
-    // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet
-    //    reserved.
-    // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is
-    //    required.
-    if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
-      assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
-      LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
-    } else if (VGPRIndex == 0) {
+    if (VGPRIndex == 0) {
       LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
       if (LaneVGPR == AMDGPU::NoRegister) {
         // We have no VGPRs left for spilling SGPRs. Reset because we will not
         // partially spill the SGPR to VGPRs.
         SGPRToVGPRSpills.erase(FI);
         NumVGPRSpillLanes -= I;

+        // FIXME: We can run out of free registers with split allocation if
+        // IPRA is enabled and a called function already uses every VGPR.
 #if 0
         DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
                                                   "VGPRs for SGPR spilling",
@@ -340,21 +332,6 @@
   return true;
 }

-/// Reserve a VGPR for spilling of SGPRs
-bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-
-  Register LaneVGPR = TRI->findUnusedRegister(
-      MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
-  if (LaneVGPR == Register())
-    return false;
-  SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None));
-  FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
-  return true;
-}
-
 /// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
 /// Either AGPR is spilled to VGPR or vice versa.
 /// Returns true if a \p FI can be eliminated completely.
@@ -616,24 +593,6 @@
   return false;
 }

-// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs
-bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
-                                                   MachineFunction &MF) {
-  for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) {
-    if (i->VGPR == ReservedVGPR) {
-      SpillVGPRs.erase(i);
-
-      for (MachineBasicBlock &MBB : MF) {
-        MBB.removeLiveIn(ReservedVGPR);
-        MBB.sortUniqueLiveIns();
-      }
-      this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
-      return true;
-    }
-  }
-  return false;
-}
-
 bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
   if (UsesAGPRs)
     return *UsesAGPRs;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -520,58 +520,58 @@
 ; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:192
+; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
+; GCN-NEXT: global_load_dwordx4
v[16:19], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:192 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], 
s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:224 ; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240 ; GCN-NEXT: v_and_b32_e32 v0, 31, v2 ; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 @@ -582,50 +582,50 @@ ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v16, off, 
s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:432 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:436 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:440 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:444 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:312 +; GCN-NEXT: 
buffer_store_dword v23, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload @@ -676,10 +676,10 @@ ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 ; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:492 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504 diff --git a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll rename from llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll rename to 
llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/reserve-vgpr-for-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -5,7 +5,7 @@ ret void } -; GCN-LABEL: {{^}}reserve_vgpr_with_no_lower_vgpr_available: +; GCN-LABEL: {{^}}spill_sgpr_with_no_lower_vgpr_available: ; GCN: buffer_store_dword v255, off, s[0:3], s32 ; GCN: v_writelane_b32 v255, s33, 2 ; GCN: v_writelane_b32 v255, s30, 0 @@ -16,7 +16,7 @@ ; GCN: v_readlane_b32 s33, v255, 2 ; GCN: ; NumVgprs: 256 -define void @reserve_vgpr_with_no_lower_vgpr_available() #0 { +define void @spill_sgpr_with_no_lower_vgpr_available() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -51,7 +51,7 @@ ret void } -; GCN-LABEL: {{^}}reserve_lowest_available_vgpr: +; GCN-LABEL: {{^}}spill_to_lowest_available_vgpr: ; GCN: buffer_store_dword v254, off, s[0:3], s32 ; GCN: v_writelane_b32 v254, s33, 2 ; GCN: v_writelane_b32 v254, s30, 0 @@ -61,7 +61,7 @@ ; GCN: v_readlane_b32 s31, v254, 1 ; GCN: v_readlane_b32 s33, v254, 2 -define void @reserve_lowest_available_vgpr() #0 { +define void @spill_to_lowest_available_vgpr() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -96,14 +96,14 @@ ret void } -; GCN-LABEL: {{^}}reserve_vgpr_with_sgpr_spills: +; GCN-LABEL: {{^}}spill_sgpr_with_sgpr_uses: ; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 ; GCN: ; def s4 ; GCN: v_writelane_b32 v254, s4, 2 ; GCN: v_readlane_b32 s4, v254, 2 ; GCN: ; use s4 -define void @reserve_vgpr_with_sgpr_spills() #0 { +define void @spill_sgpr_with_sgpr_uses() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -147,12 +147,12 @@ ret void } -; GCN-LABEL: {{^}}reserve_vgpr_with_tail_call +; GCN-LABEL: {{^}}spill_sgpr_with_tail_call ; GCN-NOT: buffer_store_dword v255, off, s[0:3], s32 ; GCN-NOT: v_writelane ; GCN: s_setpc_b64 s[4:5] -define void @reserve_vgpr_with_tail_call() #0 { +define void @spill_sgpr_with_tail_call() #0 { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca @@ -187,17 +187,29 @@ ret void } -; GCN-LABEL: {{^}}reserve_vgpr_for_sgpr_spills_no_alloca: -; GCN: v_writelane_b32 v5, s34, 0 -; GCN: v_writelane_b32 v5, s35, 1 -; GCN: v_writelane_b32 v5, s36, 2 -; GCN: v_writelane_b32 v5, s37, 3 -; GCN: v_readlane_b32 s37, v5, 3 -; GCN: v_readlane_b32 s36, v5, 2 -; GCN: v_readlane_b32 s35, v5, 1 -; GCN: v_readlane_b32 s34, v5, 0 - -define void @reserve_vgpr_for_sgpr_spills_no_alloca(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +; Special case where all registers are explicitly clobbered in the function and +; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory. 
+
+; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr:
+; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0
+; GCN: buffer_store_dword [[A]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0
+; GCN: buffer_store_dword [[B]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0
+; GCN: buffer_store_dword [[C]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0
+; GCN: buffer_store_dword [[D]], off, s[0:3], s32
+; GCN: #ASMEND
+; GCN: buffer_load_dword [[E:v[0-9]+]]
+; GCN: v_readlane_b32 s37, [[E]], 0
+; GCN: buffer_load_dword [[F:v[0-9]+]]
+; GCN: v_readlane_b32 s36, [[F]], 0
+; GCN: buffer_load_dword [[G:v[0-9]+]]
+; GCN: v_readlane_b32 s35, [[G]], 0
+; GCN: buffer_load_dword [[H:v[0-9]+]]
+; GCN: v_readlane_b32 s34, [[H]], 0
+
+define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   call void asm sideeffect "",
   "~{v6},~{v7},~{v8},~{v9}
@@ -234,4 +246,96 @@
   ret void
 }

+; If the IPRA no-CSR optimization is enabled, we will not be able to allocate a
+; VGPR for SGPR spills in the parent function since this child function uses
+; all VGPRs.
+
+define internal void @child_function_ipra() #0 {
+  call void asm sideeffect "",
+  "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+  ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+  ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+  ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
+  ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
+  ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
+  ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
+  ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
+  ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
+  ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
+  ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
+  ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
+  ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
+  ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
+  ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
+  ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
+  ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
+  ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
+  ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
+  ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
+  ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
+  ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
+  ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
+  ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
+  ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
+  ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra:
+; GCN: v_writelane_b32 v0, s30, 0
+; GCN: v_writelane_b32 v0, s31, 1
+; GCN: buffer_store_dword v0, off
+; GCN: swappc
+; GCN: buffer_load_dword v0, off
+; GCN: v_readlane_b32 s30, v0, 0
+; GCN: v_readlane_b32 s31,
v0, 1 +define void @spill_sgpr_no_free_vgpr_ipra() #0 { + call void @child_function_ipra() + ret void +} + +define internal void @child_function_ipra_tail_call() #0 { + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39} + ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49} + ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59} + ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69} + ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79} + ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89} + ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99} + ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109} + ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119} + ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129} + ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139} + ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149} + ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159} + ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169} + ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179} + ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189} + ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199} + ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209} + ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219} + ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229} + ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239} + ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249} + ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #0 + ret void +} + +; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr_ipra_tail_call: +; GCN-NOT: v_writelane_b32 +; GCN-NOT: buffer_store_dword +; GCN-NOT: swappc +; GCN-NOT: buffer_load_dword v0, off +; GCN-NOT: v_readlane_b32 +; GCN: setpc +define void @spill_sgpr_no_free_vgpr_ipra_tail_call() #0 { + tail call void @child_function_ipra_tail_call() + ret void +} + + attributes #0 = { nounwind noinline norecurse "amdgpu-flat-work-group-size"="1,256" }
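
Note (not part of the patch): a minimal IR sketch of the behavior this change targets. The RUN line, -mcpu choice, and function name below are illustrative assumptions, not taken from the patch. In a non-entry function, inline asm that clobbers callee-saved SGPRs forces CSR SGPR spills; with this change the spill VGPR is chosen lazily in SILowerSGPRSpills rather than pre-reserved in finalizeLowering.

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; s34-s37 are callee-saved in a non-entry function, so clobbering them forces
; CSR SGPR spills; expect writelane/readlane into a lazily picked VGPR.
; GCN: v_writelane_b32
; GCN: v_readlane_b32
define void @force_csr_sgpr_spill() #0 {
  call void asm sideeffect "", "~{s34},~{s35},~{s36},~{s37}"()
  ret void
}

attributes #0 = { nounwind noinline }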