Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -368,6 +368,16 @@

 }

+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+       I != E; ++I) {
+    if (!MFI.isDeadObjectIndex(I))
+      return false;
+  }
+
+  return true;
+}
+
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
   MachineFunction &MF,
   RegScavenger *RS) const {
@@ -376,45 +386,55 @@
   if (!MFI.hasStackObjects())
     return;

-  assert(RS && "RegScavenger required if spilling");
-  int ScavengeFI = MFI.CreateStackObject(
-    AMDGPU::SGPR_32RegClass.getSize(),
-    AMDGPU::SGPR_32RegClass.getAlignment(), false);
-  RS->addScavengingFrameIndex(ScavengeFI);
-
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  if (!TRI.spillSGPRToVGPR())
-    return;
-
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  if (!FuncInfo->hasSpilledSGPRs())
-    return;
-
-  // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
-  // are spilled to VGPRs, in which case we can eliminate the stack usage.
-  //
-  // XXX - This operates under the assumption that only other SGPR spills are
-  // users of the frame index. I'm not 100% sure this is correct. The
-  // StackColoring pass has a comment saying a future improvement would be to
-  // merging of allocas with spill slots, but for now according to
-  // MachineFrameInfo isSpillSlot can't alias any other object.
-  for (MachineBasicBlock &MBB : MF) {
-    MachineBasicBlock::iterator Next;
-    for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
-      MachineInstr &MI = *I;
-      Next = std::next(I);
-
-      if (TII->isSGPRSpill(MI)) {
-        int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
-        if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI))
-          TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+  bool AllSGPRSpilledToVGPRs = false;
+
+  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
+    AllSGPRSpilledToVGPRs = true;
+
+    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+    // are spilled to VGPRs, in which case we can eliminate the stack usage.
+    //
+    // XXX - This operates under the assumption that only other SGPR spills are
+    // users of the frame index. I'm not 100% sure this is correct. The
+    // StackColoring pass has a comment saying a future improvement would be to
+    // allow merging of allocas with spill slots, but for now according to
+    // MachineFrameInfo isSpillSlot can't alias any other object.
+    for (MachineBasicBlock &MBB : MF) {
+      MachineBasicBlock::iterator Next;
+      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+        MachineInstr &MI = *I;
+        Next = std::next(I);
+
+        if (TII->isSGPRSpill(MI)) {
+          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+            (void)Spilled;
+            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+          } else
+            AllSGPRSpilledToVGPRs = false;
+        }
       }
     }
+
+    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
   }

-  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
+  // but currently hasNonSpillStackObjects is set only from source
+  // allocas. Stack temps produced from legalization are not counted currently.
+  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
+      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+    assert(RS && "RegScavenger required if spilling");
+    int ScavengeFI = MFI.CreateStackObject(
+      AMDGPU::SGPR_32RegClass.getSize(),
+      AMDGPU::SGPR_32RegClass.getAlignment(), false);
+    RS->addScavengingFrameIndex(ScavengeFI);
+  }
 }

 void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -19,13 +19,12 @@
 ; CHECK: s_mov_b32 m0

 ; Make sure scratch space isn't being used for SGPR->VGPR spills
-; FIXME: Seem to be leaving behind unused emergency slot.

 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; CHECK-NOT: s_buffer_load_dword m0
 ; CHECK: s_endpgm

-; TOVGPR: ScratchSize: 4{{$}}
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
@@ -676,7 +675,7 @@
 ; CHECK-LABEL: {{^}}main1:
 ; CHECK: s_endpgm

-; TOVGPR: ScratchSize: 4{{$}}
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
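
Note on the test change: the emergency scavenging slot is a single SGPR_32
stack object (AMDGPU::SGPR_32RegClass.getSize() is 4 bytes), which is why
skipping its creation drops the reported ScratchSize from 4 to 0 once every
SGPR spill is folded into a VGPR. For review purposes, the new gating logic
distills to the sketch below; it only restates the condition added by the
patch, and the helper name stillNeedsScavengeSlot is invented here, not part
of the tree:

  // Hypothetical distillation of when processFunctionBeforeFrameFinalized
  // still reserves the emergency scavenging slot after this patch.
  static bool stillNeedsScavengeSlot(const SIMachineFunctionInfo *FuncInfo,
                                     const MachineFrameInfo &MFI,
                                     bool AllSGPRSpilledToVGPRs) {
    // Keep the slot if some stack access may survive frame finalization:
    // a source alloca, a spilled VGPR, an SGPR spill that could not be
    // placed in a VGPR, or a live stack object the two flags miss (see
    // the FIXME in the patch body).
    return FuncInfo->hasNonSpillStackObjects() ||
           FuncInfo->hasSpilledVGPRs() ||
           !AllSGPRSpilledToVGPRs ||
           !allStackObjectsAreDead(MFI);
  }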