Index: include/llvm/CodeGen/Passes.h =================================================================== --- include/llvm/CodeGen/Passes.h +++ include/llvm/CodeGen/Passes.h @@ -15,6 +15,8 @@ #ifndef LLVM_CODEGEN_PASSES_H #define LLVM_CODEGEN_PASSES_H +#include "llvm/CodeGen/RegAllocCommon.h" + #include #include @@ -162,16 +164,20 @@ /// possible. It is best suited for debug code where live ranges are short. /// FunctionPass *createFastRegisterAllocator(); + FunctionPass *createFastRegisterAllocator(RegClassFilterFunc F, + bool ClearVirtRegs); /// BasicRegisterAllocation Pass - This pass implements a degenerate global /// register allocator using the basic regalloc framework. /// FunctionPass *createBasicRegisterAllocator(); + FunctionPass *createBasicRegisterAllocator(RegClassFilterFunc F); /// Greedy register allocation pass - This pass implements a global register /// allocator for optimized builds. /// FunctionPass *createGreedyRegisterAllocator(); + FunctionPass *createGreedyRegisterAllocator(RegClassFilterFunc F); /// PBQPRegisterAllocation Pass - This pass implements the Partitioned Boolean /// Quadratic Prograaming (PBQP) based register allocator. Index: include/llvm/CodeGen/RegAllocCommon.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/RegAllocCommon.h @@ -0,0 +1,32 @@ +//===- RegAllocCommon.h - Utilities shared between allocators ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCCOMMON_H +#define LLVM_CODEGEN_REGALLOCCOMMON_H + +#include + +namespace llvm { + +class TargetRegisterClass; +class TargetRegisterInfo; + +typedef std::function RegClassFilterFunc; + +/// Default register class filter function for register allocation. 
All virtual +/// registers should be allocated. +static inline bool allocateAllRegClasses(const TargetRegisterInfo &, + const TargetRegisterClass &) { + return true; +} + +} + +#endif // LLVM_CODEGEN_REGALLOCCOMMON_H Index: include/llvm/CodeGen/RegAllocRegistry.h =================================================================== --- include/llvm/CodeGen/RegAllocRegistry.h +++ include/llvm/CodeGen/RegAllocRegistry.h @@ -15,6 +15,7 @@ #ifndef LLVM_CODEGEN_REGALLOCREGISTRY_H #define LLVM_CODEGEN_REGALLOCREGISTRY_H +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/MachinePassRegistry.h" namespace llvm { Index: include/llvm/CodeGen/TargetFrameLowering.h =================================================================== --- include/llvm/CodeGen/TargetFrameLowering.h +++ include/llvm/CodeGen/TargetFrameLowering.h @@ -23,6 +23,7 @@ class CalleeSavedInfo; class MachineFunction; class RegScavenger; + class VirtRegMap; /// Information about stack frame layout on the target. It holds the direction /// of stack growth, the known stack alignment on entry to each function, and Index: lib/CodeGen/LiveIntervals.cpp =================================================================== --- lib/CodeGen/LiveIntervals.cpp +++ lib/CodeGen/LiveIntervals.cpp @@ -696,10 +696,15 @@ if (LI.empty()) continue; + // Target may have not allocated this yet. + unsigned PhysReg = VRM->getPhys(Reg); + if (PhysReg == 0) + continue; + // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. 
RU.clear(); - for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid(); + for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { const LiveRange &RURange = getRegUnit(*Unit); if (RURange.empty()) Index: lib/CodeGen/RegAllocBase.h =================================================================== --- lib/CodeGen/RegAllocBase.h +++ lib/CodeGen/RegAllocBase.h @@ -38,6 +38,7 @@ #define LLVM_LIB_CODEGEN_REGALLOCBASE_H #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/RegisterClassInfo.h" namespace llvm { @@ -68,6 +69,7 @@ LiveIntervals *LIS = nullptr; LiveRegMatrix *Matrix = nullptr; RegisterClassInfo RegClassInfo; + RegClassFilterFunc ShouldAllocateClass; /// Inst which is a def of an original reg and whose defs are already all /// dead after remat is saved in DeadRemats. The deletion of such inst is @@ -75,7 +77,9 @@ /// always available for the remat of all the siblings of the original reg. SmallPtrSet DeadRemats; - RegAllocBase() = default; + RegAllocBase(RegClassFilterFunc F = allocateAllRegClasses) : + ShouldAllocateClass(F) {} + virtual ~RegAllocBase() = default; // A RegAlloc pass should call this before allocatePhysRegs. @@ -93,7 +97,10 @@ virtual Spiller &spiller() = 0; /// enqueue - Add VirtReg to the priority queue of unassigned registers. - virtual void enqueue(LiveInterval *LI) = 0; + virtual void enqueueImpl(LiveInterval *LI) = 0; + + /// enqueue - Add VirtReg to the priority queue of unassigned registers. + void enqueue(LiveInterval *LI); /// dequeue - Return the next unassigned register, or NULL. 
virtual LiveInterval *dequeue() = 0; Index: lib/CodeGen/RegAllocBase.cpp =================================================================== --- lib/CodeGen/RegAllocBase.cpp +++ lib/CodeGen/RegAllocBase.cpp @@ -167,3 +167,19 @@ } DeadRemats.clear(); } + +void RegAllocBase::enqueue(LiveInterval *LI) { + const unsigned Reg = LI->reg; + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Can only enqueue virtual registers"); + + const TargetRegisterClass &RC = *MRI->getRegClass(Reg); + if (!ShouldAllocateClass(*TRI, RC)) + return; + + if (VRM->hasPhys(Reg)) + return; + + enqueueImpl(LI); +} Index: lib/CodeGen/RegAllocBasic.cpp =================================================================== --- lib/CodeGen/RegAllocBasic.cpp +++ lib/CodeGen/RegAllocBasic.cpp @@ -77,7 +77,7 @@ void LRE_WillShrinkVirtReg(unsigned) override; public: - RABasic(); + RABasic(RegClassFilterFunc F = allocateAllRegClasses); /// Return the pass name. StringRef getPassName() const override { return "Basic Register Allocator"; } @@ -89,7 +89,7 @@ Spiller &spiller() override { return *SpillerInstance; } - void enqueue(LiveInterval *LI) override { + void enqueueImpl(LiveInterval *LI) override { Queue.push(LI); } @@ -167,7 +167,9 @@ enqueue(&LI); } -RABasic::RABasic(): MachineFunctionPass(ID) { +RABasic::RABasic(RegClassFilterFunc F): + MachineFunctionPass(ID), + RegAllocBase(F) { } void RABasic::getAnalysisUsage(AnalysisUsage &AU) const { @@ -328,7 +330,10 @@ return true; } -FunctionPass* llvm::createBasicRegisterAllocator() -{ +FunctionPass* llvm::createBasicRegisterAllocator() { return new RABasic(); } + +FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) { + return new RABasic(F); +} Index: lib/CodeGen/RegAllocFast.cpp =================================================================== --- lib/CodeGen/RegAllocFast.cpp +++ lib/CodeGen/RegAllocFast.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include 
"llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocCommon.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -69,7 +70,13 @@ public: static char ID; - RegAllocFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} + RegAllocFast(RegClassFilterFunc F = allocateAllRegClasses, + bool ClearVirtRegs_ = true) : + MachineFunctionPass(ID), + ShouldAllocateClass(F), + StackSlotForVirtReg(-1), + ClearVirtRegs(ClearVirtRegs_) { + } private: MachineFrameInfo *MFI; @@ -77,6 +84,7 @@ const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; RegisterClassInfo RegClassInfo; + RegClassFilterFunc ShouldAllocateClass; /// Basic block currently being allocated. MachineBasicBlock *MBB; @@ -84,6 +92,8 @@ /// Maps virtual regs to the frame index where these values are spilled. IndexedMap StackSlotForVirtReg; + bool ClearVirtRegs; + /// Everything we know about a live virtual register. struct LiveReg { MachineInstr *LastUse = nullptr; ///< Last instr to use reg. @@ -199,8 +209,12 @@ } MachineFunctionProperties getSetProperties() const override { - return MachineFunctionProperties().set( + if (ClearVirtRegs) { + return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); + } + + return MachineFunctionProperties(); } private: @@ -1316,9 +1330,11 @@ for (MachineBasicBlock &MBB : MF) allocateBasicBlock(MBB); - // All machine operands and other references to virtual registers have been - // replaced. Remove the virtual registers. - MRI->clearVirtRegs(); + if (ClearVirtRegs) { + // All machine operands and other references to virtual registers have been + // replaced. Remove the virtual registers. 
+ MRI->clearVirtRegs(); + } StackSlotForVirtReg.clear(); LiveDbgValueMap.clear(); @@ -1328,3 +1344,9 @@ FunctionPass *llvm::createFastRegisterAllocator() { return new RegAllocFast(); } + +FunctionPass *llvm::createFastRegisterAllocator( + std::function Ftor, bool ClearVirtRegs) { + return new RegAllocFast(Ftor, ClearVirtRegs); +} Index: lib/CodeGen/RegAllocGreedy.cpp =================================================================== --- lib/CodeGen/RegAllocGreedy.cpp +++ lib/CodeGen/RegAllocGreedy.cpp @@ -413,7 +413,7 @@ SmallSetVector SetOfBrokenHints; public: - RAGreedy(); + RAGreedy(RegClassFilterFunc F = allocateAllRegClasses); /// Return the pass name. StringRef getPassName() const override { return "Greedy Register Allocator"; } @@ -422,7 +422,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override; void releaseMemory() override; Spiller &spiller() override { return *SpillerInstance; } - void enqueue(LiveInterval *LI) override; + void enqueueImpl(LiveInterval *LI) override; LiveInterval *dequeue() override; unsigned selectOrSplit(LiveInterval&, SmallVectorImpl&) override; void aboutToRemoveInterval(LiveInterval &) override; @@ -595,7 +595,22 @@ return new RAGreedy(); } -RAGreedy::RAGreedy(): MachineFunctionPass(ID) { +namespace llvm { +FunctionPass* createGreedyRegisterAllocator( + std::function Ftor); + +} + +FunctionPass* llvm::createGreedyRegisterAllocator( + std::function Ftor) { + return new RAGreedy(Ftor); +} + +RAGreedy::RAGreedy(RegClassFilterFunc F): + MachineFunctionPass(ID), + RegAllocBase(F) { } void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { @@ -652,7 +667,7 @@ // Register is assigned, put it back on the queue for reassignment. 
LiveInterval &LI = LIS->getInterval(VirtReg); Matrix->unassign(LI); - enqueue(&LI); + RegAllocBase::enqueue(&LI); } void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) { @@ -675,7 +690,7 @@ GlobalCand.clear(); } -void RAGreedy::enqueue(LiveInterval *LI) { enqueue(Queue, LI); } +void RAGreedy::enqueueImpl(LiveInterval *LI) { enqueue(Queue, LI); } void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Prioritize live ranges by size, assigning larger ranges first. @@ -2919,7 +2934,12 @@ if (TargetRegisterInfo::isPhysicalRegister(Reg)) continue; - assert(VRM->hasPhys(Reg) && "We have unallocated variable!!"); + // This may be a skipped class + if (!VRM->hasPhys(Reg)) { + assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) && + "We have an unallocated variable which should have been handled"); + continue; + } // Get the live interval mapped with this virtual register to be able // to check for the interference with the new color. Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -134,6 +134,9 @@ void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; +void initializeSILowerSGPRSpillsPass(PassRegistry &); +extern char &SILowerSGPRSpillsID; + void initializeSILoadStoreOptimizerPass(PassRegistry &); extern char &SILoadStoreOptimizerID; Index: lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- lib/Target/AMDGPU/AMDGPUCallingConv.td +++ lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -97,6 +97,11 @@ (sequence "SGPR%u", 32, 103) >; +// Just to get the regmask, not for calling convention purposes. 
+def CSR_AMDGPU_AllVGPRs : CalleeSavedRegs< + (sequence "VGPR%u", 0, 255) +>; + def CSR_AMDGPU_HighRegs : CalleeSavedRegs< (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103) >; Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -87,3 +87,7 @@ const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); return FuncInfo->getFrameOffsetReg(); } + +const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const { + return CSR_AMDGPU_AllVGPRs_RegMask; +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -51,6 +52,115 @@ using namespace llvm; +namespace { +class SGPRRegisterRegAlloc : public RegisterRegAllocBase { +public: + SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + +class VGPRRegisterRegAlloc : public RegisterRegAllocBase { +public: + VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) + : RegisterRegAllocBase(N, D, C) {} +}; + +static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return static_cast(TRI).isSGPRClass(&RC); +} + +static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, + const TargetRegisterClass &RC) { + return !static_cast(TRI).isSGPRClass(&RC); +} + + +/// -{sgpr|vgpr}-regalloc=... command line option. 
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } + +/// A dummy default pass factory indicates whether the register allocator is +/// overridden on the command line. +static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; +static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; + +static SGPRRegisterRegAlloc +defaultSGPRRegAlloc("default", + "pick SGPR register allocator based on -O option", + useDefaultRegisterAllocator); + +static cl::opt> +SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for SGPRs")); + +static cl::opt> +VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for VGPRs")); + + +static void initializeDefaultSGPRRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = SGPRRegAlloc; + SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); + } +} + +static void initializeDefaultVGPRRegisterAllocatorOnce() { + RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); + + if (!Ctor) { + Ctor = VGPRRegAlloc; + VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); + } +} + +static FunctionPass *createBasicSGPRRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateSGPRs); +} + +static FunctionPass *createGreedySGPRRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateSGPRs); +} + +static FunctionPass *createFastSGPRRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateSGPRs, false); +} + +static FunctionPass *createBasicVGPRRegisterAllocator() { + return createBasicRegisterAllocator(onlyAllocateVGPRs); +} + +static FunctionPass *createGreedyVGPRRegisterAllocator() { + return createGreedyRegisterAllocator(onlyAllocateVGPRs); +} + +static FunctionPass *createFastVGPRRegisterAllocator() { + return createFastRegisterAllocator(onlyAllocateVGPRs, 
true); +} + +static SGPRRegisterRegAlloc basicRegAllocSGPR( + "basic", "basic register allocator", createBasicSGPRRegisterAllocator); +static SGPRRegisterRegAlloc greedyRegAllocSGPR( + "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); + +static SGPRRegisterRegAlloc fastRegAllocSGPR( + "fast", "fast register allocator", createFastSGPRRegisterAllocator); + + +static VGPRRegisterRegAlloc basicRegAllocVGPR( + "basic", "basic register allocator", createBasicVGPRRegisterAllocator); +static VGPRRegisterRegAlloc greedyRegAllocVGPR( + "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); + +static VGPRRegisterRegAlloc fastRegAllocVGPR( + "fast", "fast register allocator", createFastVGPRRegisterAllocator); +} + + static cl::opt EnableR600StructurizeCFG( "r600-ir-structurize", cl::desc("Use StructurizeCFG IR pass"), @@ -172,6 +282,7 @@ initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFixupVectorISelPass(*PR); @@ -574,6 +685,14 @@ bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; + + FunctionPass *createSGPRAllocPass(bool Optimized); + FunctionPass *createVGPRAllocPass(bool Optimized); + FunctionPass *createRegAllocPass(bool Optimized) override; + + bool addRegAssignmentFast() override; + bool addRegAssignmentOptimized() override; + void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -888,6 +1007,83 @@ TargetPassConfig::addOptimizedRegAlloc(); } +FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { + // Initialize the global default. 
+ llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, + initializeDefaultSGPRRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyRegisterAllocator(onlyAllocateSGPRs); + + return createFastRegisterAllocator(onlyAllocateSGPRs, false); +} + +FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { + // Initialize the global default. + llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, + initializeDefaultVGPRRegisterAllocatorOnce); + + RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); + if (Ctor != useDefaultRegisterAllocator) + return Ctor(); + + if (Optimized) + return createGreedyVGPRRegisterAllocator(); + + return createFastVGPRRegisterAllocator(); +} + +FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { + llvm_unreachable("should not be used"); +} + +static const char RegAllocOptNotSupportedMessage[] = + "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; + +bool GCNPassConfig::addRegAssignmentFast() { + if (!usingDefaultRegAlloc()) + report_fatal_error(RegAllocOptNotSupportedMessage); + + addPass(createSGPRAllocPass(false)); + + // Equivalent of PEI for SGPRs. + addPass(&SILowerSGPRSpillsID); + + addPass(createVGPRAllocPass(false)); + return true; +} + +bool GCNPassConfig::addRegAssignmentOptimized() { + if (!usingDefaultRegAlloc()) + report_fatal_error(RegAllocOptNotSupportedMessage); + + addPass(createSGPRAllocPass(true)); + + addPreRewrite(); + + // Commit allocated register changes. This is mostly necessary because too + // many things rely on the use lists of the physical registers, such as the + // verifier. This is only necessary with allocators which use LiveIntervals, + // since FastRegAlloc does the replacements itself. + addPass(createVirtRegRewriter(false)); + + // Equivalent of PEI for SGPRs. 
+ addPass(&SILowerSGPRSpillsID); + + addPass(createVGPRAllocPass(true)); + + addPreRewrite(); + addPass(&VirtRegRewriterID); + + addPass(&StackSlotColoringID); + + return true; +} + void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); if (getOptLevel() > CodeGenOpt::None) Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -109,6 +109,7 @@ SILoadStoreOptimizer.cpp SILowerControlFlow.cpp SILowerI1Copies.cpp + SILowerSGPRSpills.cpp SIMachineFunctionInfo.cpp SIMachineScheduler.cpp SIMemoryLegalizer.cpp Index: lib/Target/AMDGPU/SIFrameLowering.h =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.h +++ lib/Target/AMDGPU/SIFrameLowering.h @@ -38,6 +38,9 @@ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const; + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -590,15 +590,6 @@ .addImm(RoundedSize * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); } - - for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg - : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) - continue; - TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true, - Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, - &TII->getRegisterInfo()); - } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -611,15 +602,6 @@ const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - for (const 
SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg - : FuncInfo->getSGPRSpillVGPRs()) { - if (!Reg.FI.hasValue()) - continue; - TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR, - Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, - &TII->getRegisterInfo()); - } - unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); if (StackPtrReg == AMDGPU::NoRegister) return; @@ -671,47 +653,11 @@ const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); - SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - bool AllSGPRSpilledToVGPRs = false; - - if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) { - AllSGPRSpilledToVGPRs = true; - - // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs - // are spilled to VGPRs, in which case we can eliminate the stack usage. - // - // XXX - This operates under the assumption that only other SGPR spills are - // users of the frame index. I'm not 100% sure this is correct. The - // StackColoring pass has a comment saying a future improvement would be to - // merging of allocas with spill slots, but for now according to - // MachineFrameInfo isSpillSlot can't alias any other object. 
- for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator Next; - for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) { - MachineInstr &MI = *I; - Next = std::next(I); - - if (TII->isSGPRSpill(MI)) { - int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); - assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL); - if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { - bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); - (void)Spilled; - assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - } else - AllSGPRSpilledToVGPRs = false; - } - } - } - - FuncInfo->removeSGPRToVGPRFrameIndices(MFI); - } // FIXME: The other checks should be redundant with allStackObjectsAreDead, // but currently hasNonSpillStackObjects is set only from source // allocas. Stack temps produced from legalization are not counted currently. - if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() || - !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) { + if (!allStackObjectsAreDead(MFI)) { assert(RS && "RegScavenger required if spilling"); // We force this to be at offset 0 so no user object ever has 0 as an @@ -733,13 +679,27 @@ } } +// Only report VGPRs to generic code. void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); +} + +void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + // The SP is specifically managed and we don't want extra spills of it. 
SavedRegs.reset(MFI->getStackPtrOffsetReg()); + SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); } MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( Index: lib/Target/AMDGPU/SILowerSGPRSpills.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -0,0 +1,299 @@ +//===-- SILowerSGPRSpills.cpp ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Handle SGPR spills. This pass takes the place of PrologEpilogInserter for all +// SGPR spills, so must insert CSR SGPR spills as well as expand them. +// +// This pass must never create new SGPR virtual registers. +// +// FIXME: Must stop RegScavenger spills in later passes. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-lower-sgpr-spills" + +using MBBVector = SmallVector; + +namespace { + +class SILowerSGPRSpills : public MachineFunctionPass { +private: + const SIRegisterInfo *TRI = nullptr; + const SIInstrInfo *TII = nullptr; + VirtRegMap *VRM = nullptr; + LiveIntervals *LIS = nullptr; + + + // Save and Restore blocks of the current function. 
Typically there is a + // single save block, unless Windows EH funclets are involved. + MBBVector SaveBlocks; + MBBVector RestoreBlocks; + +public: + static char ID; + + SILowerSGPRSpills() : MachineFunctionPass(ID) {} + + void calculateSaveRestoreBlocks(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF); + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char SILowerSGPRSpills::ID = 0; + +INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, + "SI lower SGPR spill instructions", false, false) + +char &llvm::SILowerSGPRSpillsID = SILowerSGPRSpills::ID; + +/// Insert spill code for the callee-saved registers used in the function. +static void insertCSRSaves(MachineBasicBlock &SaveBlock, + ArrayRef CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *SaveBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + MachineBasicBlock::iterator I = SaveBlock.begin(); + if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CS : CSI) { + // Insert the spill to the stack frame. 
+ unsigned Reg = CS.getReg(); + + MachineInstrSpan MIS(I); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC, + TRI); + + if (LIS) { + assert(std::distance(MIS.begin(), I) == 1); + MachineInstr &Inst = *std::prev(I); + + LIS->InsertMachineInstrInMaps(Inst); + LIS->removePhysReg(Reg); + } + } + } +} + +/// Insert restore code for the callee-saved registers used in the function. +static void insertCSRRestores(MachineBasicBlock &RestoreBlock, + std::vector &CSI, + LiveIntervals *LIS) { + MachineFunction &MF = *RestoreBlock.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + + // Restore all registers immediately before the return and any + // terminators that precede it. + MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator(); + + // FIXME: Just emit the readlane/writelane directly + if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) { + for (const CalleeSavedInfo &CI : reverse(CSI)) { + unsigned Reg = CI.getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + + TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI); + assert(I != RestoreBlock.begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + + if (LIS) { + MachineInstr &Inst = *std::prev(I); + LIS->InsertMachineInstrInMaps(Inst); + LIS->removePhysReg(Reg); + } + } + } +} + +/// Compute the sets of entry and return blocks for saving and restoring +/// callee-saved registers, and placing prolog and epilog code. 
+void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Even when we do not change any CSR, we still want to insert the
+  // prologue and epilogue of the function.
+  // So set the save points for those.
+
+  // Use the points found by shrink-wrapping, if any.
+  if (MFI.getSavePoint()) {
+    SaveBlocks.push_back(MFI.getSavePoint());
+    assert(MFI.getRestorePoint() && "Both restore and save must be set");
+    MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+    // If RestoreBlock does not have any successor and is not a return block
+    // then the end point is unreachable and we do not need to insert any
+    // epilogue.
+    if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+      RestoreBlocks.push_back(RestoreBlock);
+    return;
+  }
+
+  // Save refs to entry and return blocks.
+  SaveBlocks.push_back(&MF.front());
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.isEHFuncletEntry())
+      SaveBlocks.push_back(&MBB);
+    if (MBB.isReturnBlock())
+      RestoreBlocks.push_back(&MBB);
+  }
+}
+
+/// Create stack objects for the callee-saved SGPRs selected by
+/// determineCalleeSavesSGPR and insert their save/restore code into the
+/// previously computed SaveBlocks and RestoreBlocks.
+///
+/// Nothing is done for functions carrying the Naked attribute.
+///
+/// \returns true if save/restore code was inserted.
+bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+  const Function &F = MF.getFunction();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIFrameLowering *TFI = ST.getFrameLowering();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  RegScavenger *RS = nullptr;
+
+  // Determine which of the registers in the callee save list should be saved.
+  BitVector SavedRegs;
+  TFI->determineCalleeSavesSGPR(MF, SavedRegs, RS);
+
+  // Add the code to save and restore the callee saved registers.
+  if (!F.hasFnAttribute(Attribute::Naked)) {
+    MFI.setCalleeSavedInfoValid(true);
+
+    std::vector<CalleeSavedInfo> CSI;
+    const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+    // CSRegs is walked until its zero terminator.
+    for (unsigned I = 0; CSRegs[I]; ++I) {
+      unsigned Reg = CSRegs[I];
+      if (SavedRegs.test(Reg)) {
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+
+        int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+                                           TRI->getSpillAlignment(*RC),
+                                           true);
+
+        CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+      }
+    }
+
+    if (!CSI.empty()) {
+      for (MachineBasicBlock *SaveBlock : SaveBlocks)
+        insertCSRSaves(*SaveBlock, CSI, LIS);
+
+      for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+        insertCSRRestores(*RestoreBlock, CSI, LIS);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// Expose callee-saved SGPR spills, then rewrite every SGPR spill pseudo for
+/// which VGPR lanes can be allocated into lane writes/reads of that VGPR.
+///
+/// VirtRegMap and LiveIntervals are used only if they are available.
+///
+/// \returns true if the function was modified, including the case where only
+/// callee-saved register save/restore code was inserted.
+bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+
+  VRM = getAnalysisIfAvailable<VirtRegMap>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+  assert(SaveBlocks.empty() && RestoreBlocks.empty());
+
+  // First, expose any CSR SGPR spills. This is mostly the same as what PEI
+  // does, but somewhat simpler.
+  calculateSaveRestoreBlocks(MF);
+  bool HasCSRs = spillCalleeSavedRegs(MF);
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (!MFI.hasStackObjects() && !HasCSRs) {
+    SaveBlocks.clear();
+    RestoreBlocks.clear();
+    return false;
+  }
+
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  bool MadeChange = false;
+
+  if (TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) {
+    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+    // are spilled to VGPRs, in which case we can eliminate the stack usage.
+    //
+    // This operates under the assumption that only other SGPR spills are users
+    // of the frame index.
+    for (MachineBasicBlock &MBB : MF) {
+      MachineBasicBlock::iterator Next;
+      // Next is captured before MI is processed, so iteration continues from
+      // the instruction that followed MI.
+      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+        MachineInstr &MI = *I;
+        Next = std::next(I);
+
+        if (!TII->isSGPRSpill(MI))
+          continue;
+
+        int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+        assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL);
+        if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI, LIS)) {
+          bool Spilled =
+              TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, LIS);
+          (void)Spilled;
+          assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+        }
+      }
+    }
+
+    if (VRM) {
+      // We created new virtual registers for the SGPR spills, so we need to grow
+      // VirtRegMap
+      VRM->grow();
+    }
+
+    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+    MadeChange = true;
+  }
+
+  // Re-freeze reserved registers, as we've added new VGPRs to reserve.
+  if (MadeChange)
+    MF.getRegInfo().freezeReservedRegs(MF);
+
+  SaveBlocks.clear();
+  RestoreBlocks.clear();
+
+  // Inserting CSR save/restore code modifies the function even when the
+  // SGPR-to-VGPR lowering above was skipped.
+  return MadeChange || HasCSRs;
+}
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -21,7 +21,6 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -34,6 +33,7 @@ namespace llvm { +class LiveIntervals; class MachineFrameInfo; class MachineFunction; class TargetRegisterClass; @@ -202,16 +202,7 @@ bool hasReg() { return VGPR != 0;} }; - struct SGPRSpillVGPRCSR { - // VGPR used for SGPR spills - unsigned VGPR; - - // If the VGPR is a CSR, the stack slot used to save/restore it in the - // prolog/epilog.
- Optional FI; - - SGPRSpillVGPRCSR(unsigned V, Optional F) : VGPR(V), FI(F) {} - }; + using SGPRSpillMap = DenseMap>; private: // SGPR->VGPR spilling support. @@ -219,9 +210,9 @@ // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. - DenseMap> SGPRToVGPRSpills; + SGPRSpillMap SGPRToVGPRSpills; unsigned NumVGPRSpillLanes = 0; - SmallVector SpillVGPRs; + SmallVector SpillVGPRs; public: SIMachineFunctionInfo(const MachineFunction &MF); @@ -232,11 +223,16 @@ ArrayRef() : makeArrayRef(I->second); } - ArrayRef getSGPRSpillVGPRs() const { + iterator_range sgpr_spill_vgprs() const { + return SGPRToVGPRSpills; + } + + ArrayRef getSGPRSpillVGPRs() const { return SpillVGPRs; } - bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); + bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI, + LiveIntervals *LIS = nullptr); void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); bool hasCalculatedTID() const { return TIDReg != 0; }; Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -14,11 +14,13 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include #include @@ -230,18 +232,10 @@ return ArgInfo.ImplicitBufferPtr.getRegister(); } -static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { - for (unsigned I = 0; CSRegs[I]; ++I) { - if (CSRegs[I] == Reg) - return true; - } - - return false; -} - /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. 
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, - int FI) { + int FI, + LiveIntervals *LIS) { std::vector &SpillLanes = SGPRToVGPRSpills[FI]; // This has already been allocated. @@ -249,6 +243,7 @@ return true; const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -260,8 +255,6 @@ int NumLanes = Size / 4; - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); - // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs. for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { @@ -275,23 +268,25 @@ // partially spill the SGPR to VGPRs. SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; + + DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(), + "VGPRs for SGPR spilling", + 0, DS_Error); + MF.getFunction().getContext().diagnose(DiagOutOfRegs); return false; } - Optional CSRSpillFI; - if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs && - isCalleeSavedReg(CSRegs, LaneVGPR)) { - CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4); - } + MachineBasicBlock &EntryBB = MF.front(); - SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI)); + MachineInstr *ImpDef + = BuildMI(EntryBB, EntryBB.front(), + DebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), LaneVGPR); + if (LIS) + LIS->InsertMachineInstrInMaps(*ImpDef); - // Add this register as live-in to all blocks to avoid machine verifer - // complaining about use of an undefined physical register. - for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); + SpillVGPRs.push_back(LaneVGPR); } else { - LaneVGPR = SpillVGPRs.back().VGPR; + LaneVGPR = SpillVGPRs.back(); } SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex)); @@ -305,7 +300,6 @@ MFI.RemoveStackObject(R.first); } - /// \returns VGPR used for \p Dim' work item ID. 
unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const { switch (Dim) { Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -103,12 +103,23 @@ const MachineFunction &MF, unsigned Kind = 0) const override; /// If \p OnlyToVGPR is true, this will only succeed if this + bool spillSGPRImpl(MachineBasicBlock::iterator MI, + const DebugLoc &DL, + unsigned Reg, + bool IsKill, + int Index, + RegScavenger *RS, + LiveIntervals *LIS = nullptr, + bool OnlyToVGPR = false) const; + bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr, bool OnlyToVGPR = false) const; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, @@ -116,7 +127,8 @@ RegScavenger *RS) const override; bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS) const; + int FI, RegScavenger *RS, + LiveIntervals *LIS = nullptr) const; StringRef getRegAsmName(unsigned Reg) const override; @@ -188,7 +200,6 @@ unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF) const; - unsigned getSGPRPressureSet() const { return SGPRSetID; }; unsigned getVGPRPressureSet() const { return VGPRSetID; }; @@ -234,6 +245,8 @@ MachineRegisterInfo &MRI, LiveIntervals *LIS) const; + const uint32_t *getAllVGPRRegMask() const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -225,6 +225,15 @@ assert(!isSubRegister(ScratchRSrcReg, FrameReg)); } + // 
Reserve VGPRs used for SGPR spilling. + // Note we treat freezeReservedRegs unusually because we run register + // allocation in two phases. It's OK to re-freeze with new registers for the + // second run. + for (auto &SpilledFI : MFI->sgpr_spill_vgprs()) { + for (auto &SpilledVGPR : SpilledFI.second) + reserveRegisterTuples(Reserved, SpilledVGPR.VGPR); + } + return Reserved; } @@ -640,10 +649,14 @@ AMDGPU::S_BUFFER_LOAD_DWORD_SGPR}; } -bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, - int Index, - RegScavenger *RS, - bool OnlyToVGPR) const { +bool SIRegisterInfo::spillSGPRImpl(MachineBasicBlock::iterator MI, + const DebugLoc &DL, + unsigned SuperReg, + bool IsKill, + int Index, + RegScavenger *RS, + LiveIntervals *LIS, + bool OnlyToVGPR) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); @@ -659,12 +672,9 @@ const GCNSubtarget &ST = MF->getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - unsigned SuperReg = MI->getOperand(0).getReg(); - bool IsKill = MI->getOperand(0).isKill(); - const DebugLoc &DL = MI->getDebugLoc(); - MachineFrameInfo &FrameInfo = MF->getFrameInfo(); + bool SpillToSMEM = spillSGPRToSMEM(); if (SpillToSMEM && OnlyToVGPR) return false; @@ -748,25 +758,22 @@ if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; - // During SGPR spilling to VGPR, determine if the VGPR is defined. The - // only circumstance in which we say it is undefined is when it is the - // first spill to this VGPR in the first basic block. - bool VGPRDefined = true; - if (MBB == &MF->front()) - VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second; - // Mark the "old value of vgpr" input undef only if this is the first sgpr // spill to this specific vgpr in the first basic block. 
- BuildMI(*MBB, MI, DL, + MachineInstr *Writelane = BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) .addReg(SubReg, getKillRegState(IsKill)) .addImm(Spill.Lane) - .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef); + .addReg(Spill.VGPR); + + if (LIS) { + if (i == 0) + LIS->ReplaceMachineInstrInMaps(*MI, *Writelane); + else + LIS->InsertMachineInstrInMaps(*Writelane); + } - // FIXME: Since this spills to another register instead of an actual - // frame index, we should delete the frame index when all references to - // it are fixed. } else { // XXX - Can to VGPR spill fail for some subregisters but not others? if (OnlyToVGPR) @@ -813,14 +820,34 @@ .addReg(M0CopyReg, RegState::Kill); } - MI->eraseFromParent(); MFI->addToSpilledSGPRs(NumSubRegs); + + if (LIS) + LIS->removePhysReg(SuperReg); + return true; } +bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, + int Index, + RegScavenger *RS, + LiveIntervals *LIS, + bool OnlyToVGPR) const { + + unsigned SuperReg = MI->getOperand(0).getReg(); + bool IsKill = MI->getOperand(0).isKill(); + const DebugLoc &DL = MI->getDebugLoc(); + + auto Ret = spillSGPRImpl(MI, DL, SuperReg, IsKill, Index, + RS, LIS, OnlyToVGPR); + MI->eraseFromParent(); + return Ret; +} + bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, + LiveIntervals *LIS, bool OnlyToVGPR) const { MachineFunction *MF = MI->getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -859,7 +886,7 @@ unsigned EltSize = 4; unsigned ScalarLoadOp; - const TargetRegisterClass *RC = getPhysRegClass(SuperReg); + const TargetRegisterClass *RC = getRegClassForReg(MRI, SuperReg); if (SpillToSMEM && isSGPRClass(RC)) { // XXX - if private_element_size is larger than 4 it might be useful to be // able to spill wider vmem spills. 
@@ -920,6 +947,14 @@ if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); + + if (LIS) { + if (i == e - 1) + LIS->ReplaceMachineInstrInMaps(*MI, *MIB); + else + LIS->InsertMachineInstrInMaps(*MIB); + } + } else { if (OnlyToVGPR) return false; @@ -958,6 +993,10 @@ } MI->eraseFromParent(); + + if (LIS) + LIS->removePhysReg(SuperReg); + return true; } @@ -967,20 +1006,21 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, - RegScavenger *RS) const { + RegScavenger *RS, + LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, true); + return spillSGPR(MI, FI, RS, LIS, true); case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S256_RESTORE: case AMDGPU::SI_SPILL_S128_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, true); + return restoreSGPR(MI, FI, RS, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } @@ -1076,6 +1116,8 @@ // In an entry function/kernel the stack address is already the // absolute address relative to the scratch wave offset. + // FIXME: We really need to guarantee this can never require a spill, + // since SGPR spills are assumed to be all handled already during PEI. 
unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -37,14 +37,14 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; GCN-DAG: v_writelane_b32 v32, s33, ; GCN-DAG: v_writelane_b32 v32, s34, ; GCN-DAG: v_writelane_b32 v32, s35, ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:8{{$}} ; GCN-DAG: s_mov_b32 s33, s5 @@ -53,7 +53,7 @@ ; GCN-DAG: v_readlane_b32 s35, ; GCN-DAG: v_readlane_b32 s34, ; GCN-DAG: v_readlane_b32 s33, -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_and_call() #0 { @@ -123,5 +123,57 @@ ret void } +; We can use a non-csr in a leaf function. 
+ +; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls_low_regs: +; GCN-NOT: buffer_store_dword +; GCN: v_writelane_b32 v8, +; GCN: v_readlane_b32 s{{[0-9]+}}, v8 +; GCN-NOT: buffer_load_dword +define void @callee_func_sgpr_spill_no_calls_low_regs(i32 %in) #0 { + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0 + ret void +} + +; GCN-LABEL: {{^}}callee_func_sgpr_spill_calls_low_regs: +; GCN: buffer_store_dword v32 +; GCN: v_writelane_b32 v32, +; GCN: v_readlane_b32 s{{[0-9]+}}, v32 +; GCN: buffer_load_dword v32 +define void @callee_func_sgpr_spill_calls_low_regs(i32 %in) #0 { + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + + call void 
@external_void_func_void() + + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0 + call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0 + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind "no-frame-pointer-elim"="true" } Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -577,7 +577,7 @@ ; GCN: s_swappc_b64 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:8 ; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]] ; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]] ; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}} Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -326,15 +326,18 @@ ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: v_writelane_b32 v32 ; GCN: s_add_u32 s32, s32, 0x400{{$}} -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 +; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( @@ -452,10 +455,10 @@ ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:8 ; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 -; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:8 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], ; GCN: s_swappc_b64 Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -29,6 +29,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 
; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 @@ -63,6 +64,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 @@ -97,6 +99,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 @@ -131,6 +134,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: v_writelane_b32 v32, s33, 0 ; GCN-NEXT: v_writelane_b32 v32, s34, 1 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 Index: test/CodeGen/AMDGPU/debug-value2.ll =================================================================== --- test/CodeGen/AMDGPU/debug-value2.ll +++ test/CodeGen/AMDGPU/debug-value2.ll @@ -10,9 +10,9 @@ define <4 x float> @Scene_transformT(i32 %subshapeIdx, <4 x float> %v, float %time, i8 addrspace(1)* %gScene, i32 addrspace(1)* %gSceneOffsets) local_unnamed_addr !dbg !110 { entry: +; CHECK: ;DEBUG_VALUE: Scene_transformT:gSceneOffsets <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef] $vgpr8_vgpr9 ; CHECK: ;DEBUG_VALUE: Scene_transformT:gScene <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef] $vgpr6_vgpr7 call void @llvm.dbg.value(metadata i8 addrspace(1)* %gScene, metadata !120, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !154 -; CHECK: ;DEBUG_VALUE: 
Scene_transformT:gSceneOffsets <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef] $vgpr8_vgpr9 call void @llvm.dbg.value(metadata i32 addrspace(1)* %gSceneOffsets, metadata !121, metadata !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)), !dbg !155 %call = tail call %struct.ShapeData addrspace(1)* @Scene_getSubShapeData(i32 %subshapeIdx, i8 addrspace(1)* %gScene, i32 addrspace(1)* %gSceneOffsets) %m_linearMotion = getelementptr inbounds %struct.ShapeData, %struct.ShapeData addrspace(1)* %call, i64 0, i32 2 Index: test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll =================================================================== --- test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -10,6 +10,11 @@ ; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs: +; GCN: ; implicit-def: $vgpr2 +; GCN: ; implicit-def: $vgpr1 +; GCN: ; implicit-def: $vgpr0 + + ; GCN: def s[4:11] ; GCN: v_writelane_b32 v0, s4, 0 ; GCN-NEXT: v_writelane_b32 v0, s5, 1 @@ -454,187 +459,5 @@ ret void } -; The first 64 SGPR spills can go to a VGPR, but there isn't a second -; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. 
- -; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill: - -; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15 - -; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 23 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 24 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 25 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 26 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 27 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 28 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 29 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 30 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 31 - -; GCN: def s[4:19] -; GCN: v_writelane_b32 v23, s4, 32 -; GCN-NEXT: v_writelane_b32 v23, s5, 33 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 34 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 35 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 36 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 37 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 38 -; 
GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 39 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 40 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 41 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 42 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 43 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 44 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 45 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 46 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 47 - -; GCN: def s[4:19] -; GCN: v_writelane_b32 v23, s{{[[0-9]+}}, 48 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 50 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 51 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 52 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 53 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 54 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 55 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 56 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 57 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 58 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 59 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 60 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 61 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 62 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 63 - -; GCN: def s[4:5] -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: s_cbranch_scc1 - - -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} - -; GCN: v_readlane_b32 s20, v23, 32 -; GCN-NEXT: v_readlane_b32 s21, v23, 33 -; GCN-NEXT: v_readlane_b32 s22, v23, 34 -; GCN-NEXT: v_readlane_b32 s23, v23, 35 -; GCN-NEXT: v_readlane_b32 s24, v23, 36 -; GCN-NEXT: v_readlane_b32 s25, v23, 37 -; GCN-NEXT: v_readlane_b32 s26, v23, 38 -; GCN-NEXT: v_readlane_b32 s27, v23, 39 -; GCN-NEXT: 
v_readlane_b32 s28, v23, 40 -; GCN-NEXT: v_readlane_b32 s29, v23, 41 -; GCN-NEXT: v_readlane_b32 s30, v23, 42 -; GCN-NEXT: v_readlane_b32 s31, v23, 43 -; GCN-NEXT: v_readlane_b32 s32, v23, 44 -; GCN-NEXT: v_readlane_b32 s33, v23, 45 -; GCN-NEXT: v_readlane_b32 s34, v23, 46 -; GCN-NEXT: v_readlane_b32 s35, v23, 47 - -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15 -; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} - -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31 -; GCN: ; use 
s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} - -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 48 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 49 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 50 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 51 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 52 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 53 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 54 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 55 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 56 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 57 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 58 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 59 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 60 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 61 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 62 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 63 -; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} -; GCN: ; use s[0:1] -define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { - call void asm sideeffect "", "~{v[0:7]}" () #0 - call void asm sideeffect "", "~{v[8:15]}" () #0 - call void asm sideeffect "", "~{v[16:19]}"() #0 - call void asm sideeffect "", "~{v[20:21]}"() #0 - call void asm sideeffect "", "~{v22}"() #0 - - %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 - %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 - %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 - %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 - %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 - %cmp = icmp eq i32 %in, 0 - br i1 %cmp, label %bb0, label %ret - -bb0: - call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 - call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 - call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 - call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 - call void asm sideeffect "; use 
$0", "s"(<2 x i32> %wide.sgpr4) #0 - br label %ret - -ret: - ret void -} - attributes #0 = { nounwind } attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } Index: test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -0,0 +1,106 @@ +; REQUIRES: asserts + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s +; RUN: llc -sgpr-regalloc=greedy -vgpr-regalloc=greedy -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT %s + +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=O0 %s + +; RUN: llc -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=DEFAULT-BASIC %s +; RUN: llc -sgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-DEFAULT %s +; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=BASIC-BASIC %s + +; RUN: not llc -regalloc=basic -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s +; RUN: not llc -regalloc=fast -O0 -mtriple=amdgcn-amd-amdhsa -debug-pass=Structure -o /dev/null %s 2>&1 | FileCheck -check-prefix=REGALLOC %s + + +; REGALLOC: -regalloc not supported with amdgcn. 
Use -sgpr-regalloc and -vgpr-regalloc + +; DEFAULT: Greedy Register Allocator +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: SI lower SGPR spill instructions +; DEFAULT-NEXT: Debug Variable Analysis +; DEFAULT-NEXT: Virtual Register Map +; DEFAULT-NEXT: Live Register Matrix +; DEFAULT-NEXT: Machine Optimization Remark Emitter +; DEFAULT-NEXT: Greedy Register Allocator +; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: Stack Slot Coloring + +; O0: Fast Register Allocator +; O0-NEXT: SI lower SGPR spill instructions +; O0-NEXT: Fast Register Allocator +; O0-NEXT: SI Fix VGPR copies + + + + +; BASIC-DEFAULT: Debug Variable Analysis +; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis +; BASIC-DEFAULT-NEXT: Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Basic Register Allocator +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions +; BASIC-DEFAULT-NEXT: Debug Variable Analysis +; BASIC-DEFAULT-NEXT: Virtual Register Map +; BASIC-DEFAULT-NEXT: Live Register Matrix +; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges +; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis +; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis +; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter +; BASIC-DEFAULT-NEXT: Greedy Register Allocator +; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: Stack Slot Coloring + + + +; DEFAULT-BASIC: Greedy Register Allocator +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions +; DEFAULT-BASIC-NEXT: Debug Variable Analysis +; DEFAULT-BASIC-NEXT: Virtual Register Map +; DEFAULT-BASIC-NEXT: Live Register Matrix +; DEFAULT-BASIC-NEXT: Basic Register Allocator +; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: Stack Slot Coloring + + + +; BASIC-BASIC: Debug Variable Analysis +; BASIC-BASIC-NEXT: 
Live Stack Slot Analysis +; BASIC-BASIC-NEXT: Machine Block Frequency Analysis +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: SI lower SGPR spill instructions +; BASIC-BASIC-NEXT: Debug Variable Analysis +; BASIC-BASIC-NEXT: Virtual Register Map +; BASIC-BASIC-NEXT: Live Register Matrix +; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: Stack Slot Coloring + + +declare void @bar() + +; Something with some CSR SGPR spills +define void @foo() { + call void asm sideeffect "; clobber", "~{s33}"() + call void @bar() + ret void +} + +; Block live out spills with fast regalloc +define amdgpu_kernel void @control_flow(i1 %cond) { + %s33 = call i32 asm sideeffect "; clobber", "={s33}"() + br i1 %cond, label %bb0, label %bb1 + +bb0: + call void asm sideeffect "; use %0", "s"(i32 %s33) + br label %bb1 + +bb1: + ret void +} Index: test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -0,0 +1,189 @@ +; RUN: not llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: not llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: error: VGPRs for SGPR spilling limit exceeded (0) in partial_no_vgprs_last_sgpr_spill + +; The first 64 SGPR spills can go to a VGPR, but there isn't a second +; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. 
+ +; GCN-LABEL: {{^}}partial_no_vgprs_last_sgpr_spill: + +; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15 + +; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21 +; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 23 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 24 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 25 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 26 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 27 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 28 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 29 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 30 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 31 + +; GCN: def s[4:19] +; GCN: v_writelane_b32 v23, s4, 32 +; GCN-NEXT: v_writelane_b32 v23, s5, 33 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 34 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 35 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 36 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 37 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 
38 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 39 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 40 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 41 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 42 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 43 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 44 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 45 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 46 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 47 + +; GCN: def s[4:19] +; GCN: v_writelane_b32 v23, s{{[[0-9]+}}, 48 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 50 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 51 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 52 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 53 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 54 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 55 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 56 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 57 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 58 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 59 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 60 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 61 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 62 +; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 63 + +; GCN: def s[4:5] +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: s_cbranch_scc1 + + +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} + +; GCN: v_readlane_b32 s20, v23, 32 +; GCN-NEXT: v_readlane_b32 s21, v23, 33 +; GCN-NEXT: v_readlane_b32 s22, v23, 34 +; GCN-NEXT: v_readlane_b32 s23, v23, 35 +; GCN-NEXT: v_readlane_b32 s24, v23, 36 +; GCN-NEXT: v_readlane_b32 s25, v23, 37 +; GCN-NEXT: v_readlane_b32 s26, v23, 38 +; GCN-NEXT: v_readlane_b32 s27, v23, 39 +; 
GCN-NEXT: v_readlane_b32 s28, v23, 40 +; GCN-NEXT: v_readlane_b32 s29, v23, 41 +; GCN-NEXT: v_readlane_b32 s30, v23, 42 +; GCN-NEXT: v_readlane_b32 s31, v23, 43 +; GCN-NEXT: v_readlane_b32 s32, v23, 44 +; GCN-NEXT: v_readlane_b32 s33, v23, 45 +; GCN-NEXT: v_readlane_b32 s34, v23, 46 +; GCN-NEXT: v_readlane_b32 s35, v23, 47 + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31 +; GCN: ; use 
s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} + +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 48 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 49 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 50 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 51 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 52 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 53 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 54 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 55 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 56 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 57 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 58 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 59 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 60 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 61 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 62 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 63 +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} +; GCN: ; use s[0:1] +define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { + call void asm sideeffect "", "~{v[0:7]}" () #0 + call void asm sideeffect "", "~{v[8:15]}" () #0 + call void asm sideeffect "", "~{v[16:19]}"() #0 + call void asm sideeffect "", "~{v[20:21]}"() #0 + call void asm sideeffect "", "~{v22}"() #0 + + %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 + %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0 + call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0 + call void asm sideeffect
"; use $0", "s"(<2 x i32> %wide.sgpr4) #0 + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } Index: test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir =================================================================== --- test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -1,5 +1,8 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -start-before=greedy -stop-after=stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -o - %s | FileCheck -check-prefixes=SHARE,GCN %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=3 -run-pass=greedy,virtregrewriter,stack-slot-coloring -no-stack-slot-sharing -o - %s | FileCheck -check-prefixes=NOSHARE,GCN %s + +# -run-pass is used to artificially avoid using split register allocation, which would avoid stressing StackSlotColoring. + # Make sure that stack slot coloring doesn't try to merge frame # indexes used for SGPR spilling with those that aren't.
Index: test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -1,18 +1,39 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s +; RUN: not llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s +; Previously, SGPR spilling to VGPRs was handled in a single register +; allocation run. It was possible to not have any free VGPRs for SGPR +; spilling, requiring writing out to memory which didn't work +; well. Test situations where this used to be necessary. + +; ERROR: error: VGPRs for SGPR spilling limit exceeded (0) in test + ; Make sure this doesn't crash. ; ALL-LABEL: {{^}}test: -; ALL: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 -; ALL: s_mov_b32 s[[OFF:[0-9]+]], s3 -; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 -; Make sure we are handling hazards correctly. 
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 -; SGPR-NEXT: s_waitcnt vmcnt(0) -; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] -; SGPR-NEXT: s_nop 4 -; SGPR-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 +; Initialize VGPR for spilling +; SGPR: ; implicit-def: $vgpr[[SPILL_VGPR:[0-9]+]] + +; ALL-DAG: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 +; ALL-DAG: s_mov_b32 s[[OFF:[0-9]+]], s3 +; ALL-DAG: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 + +; SGPR-DAG: v_writelane_b32 v[[SPILL_VGPR]], s{{[0-9]+}}, 0 +; SGPR-DAG: v_writelane_b32 v[[SPILL_VGPR]], s{{[0-9]+}}, 1 +; SGPR-DAG: v_writelane_b32 v[[SPILL_VGPR]], s{{[0-9]+}}, 2 +; SGPR-DAG: v_writelane_b32 v[[SPILL_VGPR]], s{{[0-9]+}}, 3 + +; Treating the VGPR as a normal value has the disadvantage of +; increasing the amount of spill code with fast regalloc +; SGPR: buffer_store_dword v[[SPILL_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill + +; SGPR: ;;#ASMSTART +; SGPR: buffer_load_dword v[[VGPR_RESTORE:[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload +; SGPR: v_readlane_b32 s{{[0-9]+}}, v[[VGPR_RESTORE]], 0 +; SGPR: v_readlane_b32 s{{[0-9]+}}, v[[VGPR_RESTORE]], 1 +; SGPR: v_readlane_b32 s{{[0-9]+}}, v[[VGPR_RESTORE]], 2 +; SGPR: v_readlane_b32 s{{[0-9]+}}, v[[VGPR_RESTORE]], 3 + ; Make sure scratch wave offset register is correctly incremented and ; then restored. 
Index: test/CodeGen/AMDGPU/sibling-call.ll =================================================================== --- test/CodeGen/AMDGPU/sibling-call.ll +++ test/CodeGen/AMDGPU/sibling-call.ll @@ -207,12 +207,12 @@ ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s33, 0 -; GCN-DAG: v_writelane_b32 v34, s34, 1 -; GCN-DAG: v_writelane_b32 v34, s35, 2 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v32, s33, 0 +; GCN-DAG: v_writelane_b32 v32, s34, 1 +; GCN-DAG: v_writelane_b32 v32, s35, 2 ; GCN-DAG: s_add_u32 s32, s32, 0x400 ; GCN-DAG: s_getpc_b64 @@ -222,13 +222,13 @@ ; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4 -; GCN-DAG: v_readlane_b32 s33, v34, 0 -; GCN-DAG: v_readlane_b32 s34, v34, 1 -; GCN-DAG: v_readlane_b32 s35, v34, 2 +; GCN-DAG: v_readlane_b32 s33, v32, 0 +; GCN-DAG: v_readlane_b32 s34, v32, 1 +; GCN-DAG: v_readlane_b32 s35, v32, 2 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 -; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12 +; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:4 +; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { Index: 
test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll =================================================================== --- test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -4,13 +4,13 @@ ; storeRegToStackSlot. ; GCN-LABEL: {{^}}spill_csr_s5_copy: -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill ; GCN: v_writelane_b32 v32, s5, 2 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s5, v32, 2 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 -; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:8 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { bb: Index: test/CodeGen/AMDGPU/spill-empty-live-interval.mir =================================================================== --- test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 -start-before=simple-register-coalescing -stop-after=greedy,1 -o - %s | FileCheck %s # https://bugs.llvm.org/show_bug.cgi?id=33620 --- Index: test/CodeGen/AMDGPU/spill-scavenge-offset.ll =================================================================== --- test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s -; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s 
+; RUN: llc -sgpr-regalloc=basic -vgpr-regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 < %s | FileCheck %s ; ; There is something about Tonga that causes this test to spend a lot of time ; in the default register allocator. Index: test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir =================================================================== --- test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -3,23 +3,24 @@ # CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}} # CHECK: stack: -# CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, +# CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, # CHECK-NEXT: stack-id: 0, +# CHECK-NOT: id: 1 -# CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, -# CHECK-NEXT: stack-id: 1, +# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) +# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) -# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5, implicit-def dead $m0 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5, implicit-def dead $m0 :: (load 4 from %stack.1, addrspace 5) + +# CHECK: $vgpr2 = V_WRITELANE_B32_vi killed $sgpr6, 0, $vgpr2 +# CHECK: dead renamable $sgpr6 = S_LOAD_DWORD_IMM undef 
$sgpr0_sgpr1, 0, 0 +# CHECK: $sgpr6 = V_READLANE_B32_vi $vgpr2, 0 name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true body: | bb.0: - %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec + %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, implicit $flat_scr, implicit $exec S_NOP 0, implicit %0 %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0