diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -55,6 +55,8 @@ const LaneBitmask LaneMask; /// Classes with a higher priority value are assigned first by register /// allocators using a greedy heuristic. The value is in the range [0,63]. + /// Values >= 32 should be used with care since they may overlap with other + /// fields in the allocator's priority heuristics. const uint8_t AllocationPriority; /// Configurable target specific flags. const uint8_t TSFlags; @@ -1076,6 +1078,14 @@ return false; } + /// When prioritizing live ranges in register allocation, if this hook returns + /// true then the AllocationPriority of the register class will be treated as + /// more important than whether the range is local to a basic block or global. + virtual bool + regClassPriorityTrumpsGlobalness(const MachineFunction &MF) const { + return false; + } + //===--------------------------------------------------------------------===// /// Debug information queries. diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -279,6 +279,8 @@ // heuristic. Classes with higher priority values are assigned first. This is // useful as it is sometimes beneficial to assign registers to highly // constrained classes first. The value has to be in the range [0,63]. + // Values >= 32 should be used with care since they may overlap with other + // fields in the allocator's priority heuristics. 
int AllocationPriority = 0; // Generate register pressure set for this register class and any class diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -322,6 +322,10 @@ /// Function ArrayRef<uint8_t> RegCosts; + /// Flags for the live range priority calculation, determined once per + /// machine function. + bool RegClassPriorityTrumpsGlobalness; + public: RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -128,6 +128,13 @@ "limit its budget and bail out once we reach the limit."), cl::init(10000), cl::Hidden); +static cl::opt<bool> GreedyRegClassPriorityTrumpsGlobalness( + "greedy-regclass-priority-trumps-globalness", + cl::desc("Change the greedy register allocator's live range priority " + "calculation to make the AllocationPriority of the register class " + "more important than whether the range is global"), + cl::Hidden); + static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator", createGreedyRegisterAllocator); @@ -305,6 +312,7 @@ const TargetRegisterClass &RC = *MRI->getRegClass(Reg); bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RCI.getNumAllocatableRegs(&RC)); + unsigned GlobalBit = 0; if (Stage == RS_Assign && !ForceGlobal && !LI->empty() && LIS->intervalIsInOneMBB(*LI)) { @@ -323,9 +331,13 @@ // Allocate global and split ranges in long->short order. Long ranges that // don't fit should be spilled (or split) ASAP so they don't create // interference. Mark a bit to prioritize global above local ranges. 
- Prio = (1u << 29) + Size; + Prio = Size; + GlobalBit = 1; } - Prio |= RC.AllocationPriority << 24; + if (RegClassPriorityTrumpsGlobalness) + Prio |= RC.AllocationPriority << 25 | GlobalBit << 24; + else + Prio |= GlobalBit << 29 | RC.AllocationPriority << 24; // Mark a higher bit to prioritize global and local above RS_Split. Prio |= (1u << 31); @@ -2692,6 +2704,10 @@ initializeCSRCost(); RegCosts = TRI->getRegisterCosts(*MF); + RegClassPriorityTrumpsGlobalness = + GreedyRegClassPriorityTrumpsGlobalness.getNumOccurrences() + ? GreedyRegClassPriorityTrumpsGlobalness + : TRI->regClassPriorityTrumpsGlobalness(*MF); ExtraInfo.emplace(); EvictAdvisor = diff --git a/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir b/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/greedy-liverange-priority.mir @@ -0,0 +1,48 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1030 -greedy-regclass-priority-trumps-globalness=0 -start-before greedy -o - %s | FileCheck %s -check-prefix=OLD +# RUN: llc -march=amdgcn -mcpu=gfx1030 -greedy-regclass-priority-trumps-globalness=1 -start-before greedy -o - %s | FileCheck %s -check-prefix=NEW + +# At the time of writing -greedy-regclass-priority-trumps-globalness makes a +# significant improvement in the total number of vgprs needed to compile this +# test, from 11 down to 7. 
+ +# OLD: NumVgprs: 11{{$}} +# NEW: NumVgprs: 7{{$}} + +--- +name: _amdgpu_cs_main +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr6 + + %6:vgpr_32 = COPY $vgpr6 + undef %30.sub0:vreg_128 = COPY $vgpr0 + undef %27.sub0:vreg_128 = V_MED3_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + undef %16.sub0:sgpr_256 = S_MOV_B32 0 + undef %26.sub1:vreg_64 = V_LSHRREV_B32_e32 1, %6, implicit $exec + %27.sub1:vreg_128 = COPY %27.sub0 + %27.sub2:vreg_128 = COPY %27.sub0 + %27.sub3:vreg_128 = COPY %27.sub0 + %26.sub0:vreg_64 = V_MOV_B32_e32 1, implicit $exec + %16.sub1:sgpr_256 = COPY %16.sub0 + %16.sub2:sgpr_256 = COPY %16.sub0 + %16.sub3:sgpr_256 = COPY %16.sub0 + %16.sub4:sgpr_256 = COPY %16.sub0 + %16.sub5:sgpr_256 = COPY %16.sub0 + %16.sub6:sgpr_256 = COPY %16.sub0 + %16.sub7:sgpr_256 = COPY %16.sub0 + IMAGE_STORE_V4_V2_gfx10 %27, %26, %16, 0, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into custom "ImageResource") + S_CBRANCH_SCC1 %bb.2, implicit undef $scc + S_BRANCH %bb.1 + + bb.1: + %30.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec + %30.sub2:vreg_128 = COPY %30.sub1 + %30.sub3:vreg_128 = COPY %30.sub1 + %26.sub1:vreg_64 = COPY %30.sub1 + IMAGE_STORE_V4_V2_gfx10 %30, %26, %16, 0, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into custom "ImageResource") + + bb.2: + S_ENDPGM 0 +...