diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -351,6 +351,18 @@
   BitVector getAllocatableSet(const MachineFunction &MF,
                               const TargetRegisterClass *RC = nullptr) const;
 
+  /// Returns a list of bitsets each indexed by register number indicating if a
+  /// register is allocatable or not. One bitset is computed for each provided
+  /// register class.
+  ///
+  /// The result is parallel to \p RCs: element I describes RCs[I]. A class
+  /// without an allocatable subclass yields an empty set (no bits set), and
+  /// reserved registers are never reported as allocatable. Computing several
+  /// sets in one call queries the reserved registers of \p MF only once.
+  SmallVector<BitVector>
+  getAllocatableSets(const MachineFunction &MF,
+                     SmallVectorImpl<const TargetRegisterClass *> &RCs) const;
+
   /// Get a list of cost values for all registers that correspond to the index
   /// returned by RegisterCostTableIndex.
   ArrayRef<uint8_t> getRegisterCosts(const MachineFunction &MF) const {
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -273,6 +273,27 @@
   return Allocatable;
 }
 
+SmallVector<BitVector> TargetRegisterInfo::getAllocatableSets(
+    const MachineFunction &MF,
+    SmallVectorImpl<const TargetRegisterClass *> &RCs) const {
+  SmallVector<BitVector> AllocatableSets;
+  // Flip in place so NotReserved is initialized directly from the returned
+  // set rather than copied from the result of flip().
+  BitVector NotReserved = getReservedRegs(MF);
+  NotReserved.flip();
+  for (const TargetRegisterClass *RC : RCs) {
+    BitVector &Allocatable = AllocatableSets.emplace_back(getNumRegs());
+    // A register class with no allocatable subclass returns an empty set.
+    const TargetRegisterClass *SubClass = getAllocatableClass(RC);
+    if (SubClass)
+      getAllocatableSetForRC(MF, SubClass, Allocatable);
+    // Mask out the reserved registers
+    Allocatable &= NotReserved;
+  }
+
+  return AllocatableSets;
+}
+
 static inline
 const TargetRegisterClass *firstCommonClass(const uint32_t *A,
                                             const uint32_t *B,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1169,7 +1169,11 @@
 
   if (OptExecMaskPreRA)
     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
-  insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
+
+  // This is not an essential optimization and it has a noticeable impact on
+  // compilation time, so we only enable it from O2.
+  if (TM->getOptLevel() > CodeGenOpt::Less)
+    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
 
   // This must be run immediately after phi elimination and before
   // TwoAddressInstructions, otherwise the processing of the tied operand of
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -273,8 +273,12 @@
   SlotIndexes *Ind = LIS->getSlotIndexes();
   bool Changed = false;
 
-  MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count();
-  MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count();
+  SmallVector<const TargetRegisterClass *> RCs(
+      {&AMDGPU::VGPR_32RegClass, &AMDGPU::SGPR_32RegClass});
+  SmallVector<BitVector> AllocatableSets = TRI->getAllocatableSets(MF, RCs);
+  MaxVGPRs = AllocatableSets[0].count();
+  MaxSGPRs = AllocatableSets[1].count();
+
   unsigned FuncMaxClause = AMDGPU::getIntegerAttribute(
       MF.getFunction(), "amdgpu-max-memory-clause", MaxClause);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -349,7 +349,6 @@
 ; GCN-O1-NEXT: Live Register Matrix
 ; GCN-O1-NEXT: SI Pre-allocate WWM Registers
 ; GCN-O1-NEXT: SI optimize exec mask operations pre-RA
-; GCN-O1-NEXT: SI Form memory clauses
 ; GCN-O1-NEXT: Machine Natural Loop Construction
 ; GCN-O1-NEXT: Machine Block Frequency Analysis
 ; GCN-O1-NEXT: Debug Variable Analysis
@@ -632,7 +631,6 @@
 ; GCN-O1-OPTS-NEXT: Live Register Matrix
 ; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
 ; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA
-; GCN-O1-OPTS-NEXT: SI Form memory clauses
 ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT: Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT: Debug Variable Analysis