diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -291,6 +291,14 @@
 
   /// This is called after a .mir file was loaded.
   virtual void mirFileLoaded(MachineFunction &MF) const;
+
+  /// True if the register allocator should use the allocation orders exactly
+  /// as written in the tablegen descriptions, false if it should allocate
+  /// the specified physical register later if it is callee-saved.
+  virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                           unsigned PhysReg) const {
+    return false;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -90,6 +90,7 @@
 void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
   assert(RC && "no register class given");
   RCInfo &RCI = RegClass[RC->getID()];
+  auto &STI = MF->getSubtarget();
 
   // Raw register count, including all reserved regs.
   unsigned NumRegs = RC->getNumRegs();
@@ -114,7 +115,8 @@
     unsigned Cost = TRI->getCostPerUse(PhysReg);
     MinCost = std::min(MinCost, Cost);
 
-    if (CalleeSavedAliases[PhysReg])
+    if (CalleeSavedAliases[PhysReg] &&
+        !STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
       // PhysReg aliases a CSR, save it for later.
       CSRAlias.push_back(PhysReg);
     else {
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -227,9 +227,10 @@
   // know how to spill them. If we make our prologue/epilogue code smarter at
   // some point, we can go back to using the above allocation orders for the
   // Thumb1 instructions that know how to use hi regs.
-  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8),
+                   (add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r15]";
 }
@@ -238,9 +239,10 @@
 // certain operand slots, particularly as the destination. Primarily
 // useful for disassembly.
 def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
-  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
+                   (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r14]";
 }
@@ -295,9 +297,10 @@
 // or SP (R13 or R15) are used. The ARM ISA refers to these operands
 // via the BadReg() pseudo-code description.
 def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
-  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
+                   (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticType = "rGPR";
 }
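The RegisterClassInfo change above is the only consumer of the new hook. As a rough model of what it does (a simplified sketch with invented helper parameters, not the actual upstream implementation), compute() walks the raw tablegen order, defers every register that aliases a callee-saved register to the end, and the new hook exempts a register from that deferral:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCRegisterInfo.h"

using namespace llvm;

// Simplified model of RegisterClassInfo::compute()'s ordering loop.
// IsCSRAlias and IgnoreCSR stand in for CalleeSavedAliases[] and
// STI.ignoreCSRForAllocationOrder(); the real code also tracks costs.
static SmallVector<MCPhysReg, 16>
buildAllocationOrder(ArrayRef<MCPhysReg> RawOrder, const BitVector &Reserved,
                     function_ref<bool(MCPhysReg)> IsCSRAlias,
                     function_ref<bool(MCPhysReg)> IgnoreCSR) {
  SmallVector<MCPhysReg, 16> Order, DeferredCSR;
  for (MCPhysReg R : RawOrder) {
    if (Reserved.test(R))
      continue;                 // reserved registers are never allocatable
    if (IsCSRAlias(R) && !IgnoreCSR(R))
      DeferredCSR.push_back(R); // costs prologue/epilogue work: use last
    else
      Order.push_back(R);       // keep the raw tablegen position
  }
  Order.append(DeferredCSR.begin(), DeferredCSR.end());
  return Order;
}

The ARM override below returns true for all GPRs under Thumb2 with minsize, so the callee-saved low registers r4-r7 keep their early slots instead of migrating behind r12, lr and the high registers.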
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -856,6 +856,10 @@
   unsigned getPrefLoopAlignment() const {
     return PrefLoopAlignment;
   }
+
+  bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                   unsigned PhysReg) const override;
+  unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -413,3 +413,45 @@
          ((isTargetMachO() && !isThumb1Only()) ||
           (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
 }
+
+unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
+  // The GPR register class has multiple possible allocation orders, with
+  // tradeoffs preferred by different sub-architectures and optimisation goals.
+  // The allocation orders are:
+  // 0: (the default tablegen order, not used)
+  // 1: r14, r0-r13
+  // 2: r0-r7
+  // 3: r0-r7, r12, lr, r8-r11
+  // Note that the register allocator will change this order so that
+  // callee-saved registers are used later, as they require extra work in the
+  // prologue/epilogue (though we sometimes override that).
+
+  // For thumb1-only targets, only the low registers are allocatable.
+  if (isThumb1Only())
+    return 2;
+
+  // Allocate low registers first, so we can select more 16-bit instructions.
+  // We also (in ignoreCSRForAllocationOrder) override the default behaviour
+  // with regard to callee-saved registers, because pushing extra registers
+  // is much cheaper (in terms of code size) than using high registers. After
+  // that, we allocate r12 (doesn't need to be saved), lr (saving it means we
+  // can return with the pop, don't need an extra "bx lr") and then the rest
+  // of the high registers.
+  if (isThumb2() && MF.getFunction().hasMinSize())
+    return 3;
+
+  // Otherwise, allocate in the default order, using LR first because saving
+  // it allows a shorter epilogue sequence.
+  return 1;
+}
+
+bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                               unsigned PhysReg) const {
+  // To minimize code size in Thumb2, we prefer the usage of low regs (lower
+  // cost per use) so we can use narrow encodings. By default, caller-saved
+  // registers (e.g. lr, r12) are always allocated first, regardless of their
+  // cost per use. Under minsize we prefer the low regs even if they are
+  // CSRs, because the extra push/pop can usually be folded into existing ones.
+  return isThumb2() && MF.getFunction().hasMinSize() &&
+         ARM::GPRRegClass.contains(PhysReg);
+}
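Taken together, the two overrides mean that a Thumb2 function carrying the minsize attribute selects AltOrder 3 and disables CSR deferral for GPRs, so the allocator sees exactly r0-r7, r12, lr, r8-r11. A hypothetical assertion-style helper spelling out that invariant (checkMinSizeGPROrder is an invented name, and the includes assume the ARM backend's internal headers are on the include path, as they are for in-tree ARM sources):

#include "ARMSubtarget.h" // ARM backend internal header, not installed
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include <cassert>

using namespace llvm;

// Hypothetical sanity check, not part of the patch: for Thumb2 + minsize
// both hooks fire, so callee-saved low registers like r4 stay ahead of
// r8-r11 in the final allocation order.
static void checkMinSizeGPROrder(const MachineFunction &MF) {
  const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
  if (!ST.isThumb2() || !MF.getFunction().hasMinSize())
    return;
  // AltOrder 3 is the new r0-r7, r12, lr, r8-r11 order added above.
  assert(ST.getGPRAllocationOrder(MF) == 3);
  // CSR deferral is disabled for GPRs, so r4 keeps its early position.
  assert(ST.ignoreCSRForAllocationOrder(MF, ARM::R4));
}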
diff --git a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
--- a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -60,7 +60,7 @@
 
 while.body:
 ; CHECK: while.body
-; CHECK: mul r{{[0-9]+}}
+; CHECK: muls r{{[0-9]+}}
 ; CHECK: muls
   %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
   %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]
diff --git a/llvm/test/CodeGen/ARM/favor-low-reg-for-Osize.ll b/llvm/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
@@ -0,0 +1,29 @@
+; REQUIRES: asserts
+; RUN: llc -debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK
+; RUN: FileCheck %s < %t --check-prefix=DEBUG
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
+target triple = "thumbv7m--linux-gnueabi"
+
+
+; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ]
+
+define i32 @test_minsize(i32 %x) optsize minsize {
+; CHECK-LABEL: test_minsize:
+entry:
+; CHECK: mov r4, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov r0, r4
+  ret i32 %x
+}
+
+; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ]
+
+define i32 @test_optsize(i32 %x) optsize {
+; CHECK-LABEL: test_optsize:
+entry:
+; CHECK: mov r12, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov r0, r12
+  ret i32 %x
+}
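The AllocationOrder(GPR) lines matched by the DEBUG prefix come from the register allocator's -debug-only=regalloc output, which is why the test carries REQUIRES: asserts. For illustration, a sketch of querying the same per-function order programmatically through RegisterClassInfo, the class patched above (printAllocationOrder is an invented helper, not an LLVM API):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper, not part of the patch: prints the final allocation
// order for a register class, mirroring the "AllocationOrder(...)" debug
// lines the DEBUG prefix matches.
static void printAllocationOrder(const MachineFunction &MF,
                                 const TargetRegisterClass *RC) {
  RegisterClassInfo RCI;
  RCI.runOnMachineFunction(MF); // recomputed per function, as regalloc does
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  errs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = [";
  for (MCPhysReg R : RCI.getOrder(RC))
    errs() << ' ' << printReg(R, TRI);
  errs() << " ]\n";
}

The CHECK lines follow directly from the two DEBUG orders: in test_minsize the low register r4 precedes r12 and lr in the order, so the live value is parked there, while in test_optsize the CSR deferral is still active and the caller-saved r12 is picked instead.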