Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -142,6 +142,9 @@ void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeSILowerSGPRSpillsPass(PassRegistry &); +extern char &SILowerSGPRSpillsID; + void initializeSILoadStoreOptimizerPass(PassRegistry &); extern char &SILoadStoreOptimizerID; Index: lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- lib/Target/AMDGPU/AMDGPUCallingConv.td +++ lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -114,6 +114,16 @@ (sequence "SGPR%u", 32, 105) >; +// Just to get the regmask, not for calling convention purposes. +def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< + (sequence "VGPR%u", 0, 255) +>; + +// Just to get the regmask, not for calling convention purposes. +def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< + (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI) +>; + def CSR_AMDGPU_HighRegs : CalleeSavedRegs< (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105) >; Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -89,3 +89,11 @@ return TFI->hasFP(MF) ? 
FuncInfo->getFrameOffsetReg() : FuncInfo->getStackPtrOffsetReg(); } + +const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { + return CSR_AMDGPU_AllVGPRs_RegMask; +} + +const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { + return CSR_AMDGPU_AllAllocatableSRegs_RegMask; +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -197,6 +197,7 @@ initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFixupVectorISelPass(*PR); @@ -958,6 +959,9 @@ if (getOptLevel() > CodeGenOpt::None) addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); + + // Equivalent of PEI for SGPRs. + addPass(&SILowerSGPRSpillsID); } void GCNPassConfig::addPreSched2() { Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -107,6 +107,7 @@ SILoadStoreOptimizer.cpp SILowerControlFlow.cpp SILowerI1Copies.cpp + SILowerSGPRSpills.cpp SIMachineFunctionInfo.cpp SIMachineScheduler.cpp SIMemoryLegalizer.cpp Index: lib/Target/AMDGPU/SIFrameLowering.h =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.h +++ lib/Target/AMDGPU/SIFrameLowering.h @@ -36,6 +36,8 @@ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const; bool isSupportedStackID(TargetStackID::Value ID) const override; Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== 
--- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -720,11 +720,10 @@ // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not // memory. -static bool allStackObjectsAreDeadOrSGPR(const MachineFrameInfo &MFI) { +static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { - if (!MFI.isDeadObjectIndex(I) && - MFI.getStackID(I) != TargetStackID::SGPRSpill) + if (!MFI.isDeadObjectIndex(I)) return false; } @@ -752,37 +751,12 @@ const SIRegisterInfo &TRI = TII->getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { - // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs - // are spilled to VGPRs, in which case we can eliminate the stack usage. - // - // XXX - This operates under the assumption that only other SGPR spills are - // users of the frame index. I'm not 100% sure this is correct. The - // StackColoring pass has a comment saying a future improvement would be to - // merging of allocas with spill slots, but for now according to - // MachineFrameInfo isSpillSlot can't alias any other object. 
- for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator Next; - for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { - MachineInstr &MI = *I; - Next = std::next(I); - - if (TII->isSGPRSpill(MI)) { - int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); - assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { - bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); - (void)Spilled; - assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - } - } - } - } - } - FuncInfo->removeSGPRToVGPRFrameIndices(MFI); - if (!allStackObjectsAreDeadOrSGPR(MFI)) { + // FIXME: The other checks should be redundant with allStackObjectsAreDead, + // but currently hasNonSpillStackObjects is set only from source + // allocas. Stack temps produced from legalization are not counted currently. + if (!allStackObjectsAreDead(MFI)) { assert(RS && "RegScavenger required if spilling"); if (FuncInfo->isEntryFunction()) { @@ -799,13 +773,33 @@ } } -void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, +// Only report VGPRs to generic code. +void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, + BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + + // VGPRs used for SGPR spilling need to be specially inserted in the prolog. 
const SIMachineFunctionInfo *MFI = MF.getInfo(); + for (auto SSpill : MFI->getSGPRSpillVGPRs()) + SavedRegs.reset(SSpill.VGPR); +} + +void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, + BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); // The SP is specifically managed and we don't want extra spills of it. SavedRegs.reset(MFI->getStackPtrOffsetReg()); + SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); } MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( Index: lib/Target/AMDGPU/SILowerSGPRSpills.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -0,0 +1,292 @@ +//===-- SILowerSGPRSpills.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all +// SGPR spills, so must insert CSR SGPR spills as well as expand them. +// +// This pass must never create new SGPR virtual registers. +// +// FIXME: Must stop RegScavenger spills in later passes. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-lower-sgpr-spills" + +using MBBVector = SmallVector; + +namespace { + +class SILowerSGPRSpills : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + VirtRegMap *VRM = nullptr; + LiveIntervals *LIS = nullptr; + + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. + MBBVector SaveBlocks; + MBBVector RestoreBlocks; + +public: + static char ID; + + SILowerSGPRSpills() : MachineFunctionPass(ID) {} + + void calculateSaveRestoreBlocks(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF); + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char SILowerSGPRSpills::ID = 0; + +INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) + +char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; + +/// Insert spill code for the callee-saved registers used in the function. 
+static void insertCSRSaves(MachineBasicBlock &SaveBlock, + ArrayRef CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *SaveBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + MachineBasicBlock::iterator I = SaveBlock.begin(); + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CS : CSI) { + // Insert the spill to the stack frame. + unsigned Reg = CS.getReg(); + + MachineInstrSpan MIS(I); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, + TRI); + + if (LIS) { + assert(std::distance(MIS.begin(), I) == 1); + MachineInstr &Inst = *std::prev(I); + + LIS->InsertMachineInstrInMaps(Inst); + LIS->removeAllRegUnitsForPhysReg(Reg); + } + } + } +} + +/// Insert restore code for the callee-saved registers used in the function. +static void insertCSRRestores(MachineBasicBlock &RestoreBlock, + std::vector &CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *RestoreBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + // Restore all registers immediately before the return and any + // terminators that precede it. 
+ MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); + + // FIXME: Just emit the readlane/writelane directly + if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CI : reverse(CSI)) { + unsigned Reg = CI.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); + assert(I != RestoreBlock.begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + + if (LIS) { + MachineInstr &Inst = *std::prev(I); + LIS->InsertMachineInstrInMaps(Inst); + LIS->removeAllRegUnitsForPhysReg(Reg); + } + } + } +} + +/// Compute the sets of entry and return blocks for saving and restoring +/// callee-saved registers, and placing prolog and epilog code. +void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + // Even when we do not change any CSR, we still want to insert the + // prologue and epilogue of the function. + // So set the save points for those. + + // Use the points found by shrink-wrapping, if any. + if (MFI.getSavePoint()) { + SaveBlocks.push_back(MFI.getSavePoint()); + assert(MFI.getRestorePoint() && "Both restore and save must be set"); + MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); + // If RestoreBlock does not have any successor and is not a return block + // then the end point is unreachable and we do not need to insert any + // epilogue. + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) + RestoreBlocks.push_back(RestoreBlock); + return; + } + + // Save refs to entry and return blocks. 
+ SaveBlocks.push_back(&MF.front()); + for (MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } +} + +bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIFrameLowering *TFI = ST.getFrameLowering(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + RegScavenger *RS = nullptr; + + // Determine which of the registers in the callee save list should be saved. + BitVector SavedRegs; + TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS); + + // Add the code to save and restore the callee saved registers. + if (!F.hasFnAttribute(Attribute::Naked)) { + // FIXME: This is a lie. The CalleeSavedInfo is incomplete, but this is + // necessary for verifier liveness checks. + MFI.setCalleeSavedInfoValid(true); + + std::vector CSI; + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + + for (unsigned I = 0; CSRegs[I]; ++I) { + unsigned Reg = CSRegs[I]; + if (SavedRegs.test(Reg)) { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC), + TRI->getSpillAlignment(*RC), + true); + + CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + } + } + + if (!CSI.empty()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + insertCSRSaves(*SaveBlock, CSI, LIS); + + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + insertCSRRestores(*RestoreBlock, CSI, LIS); + return true; + } + } + + return false; +} + +bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + + VRM = getAnalysisIfAvailable(); + + bool AllSGPRSpilledToVGPRs = false; + + assert(SaveBlocks.empty() && RestoreBlocks.empty()); + + // First, expose any CSR SGPR spills. 
This is mostly the same as what PEI + // does, but somewhat simpler. + calculateSaveRestoreBlocks(MF); + bool HasCSRs = spillCalleeSavedRegs(MF); + + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.hasStackObjects() && !HasCSRs) { + SaveBlocks.clear(); + RestoreBlocks.clear(); + return false; + } + + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + bool MadeChange = false; + + if (TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) { + AllSGPRSpilledToVGPRs = true; + + // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs + // are spilled to VGPRs, in which case we can eliminate the stack usage. + // + // This operates under the assumption that only other SGPR spills are users + // of the frame index. + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::iterator Next; + for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + if (!TII->isSGPRSpill(MI)) + continue; + + int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); + if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + } else + AllSGPRSpilledToVGPRs = false; + } + } + + for (MachineBasicBlock &MBB : MF) { + for (auto SSpill : FuncInfo->getSGPRSpillVGPRs()) + MBB.addLiveIn(SSpill.VGPR); + MBB.sortUniqueLiveIns(); + } + + FuncInfo->removeSGPRToVGPRFrameIndices(MFI); + MadeChange = true; + } + + SaveBlocks.clear(); + RestoreBlocks.clear(); + + return MadeChange; +} Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -264,6 +264,9 @@ MachineRegisterInfo &MRI, LiveIntervals *LIS) const; + const uint32_t *getAllVGPRRegMask() const; + const 
uint32_t *getAllAllocatableSRegMask() const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp,