Index: include/llvm/Target/TargetRegisterInfo.h =================================================================== --- include/llvm/Target/TargetRegisterInfo.h +++ include/llvm/Target/TargetRegisterInfo.h @@ -582,6 +582,12 @@ static void dumpReg(unsigned Reg, unsigned SubRegIndex = 0, const TargetRegisterInfo* TRI = nullptr); + /// Returns true to ignore the preferring of CSR when computing the preferred + /// register allocation order. + virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF) const { + return false; + } + protected: /// Overridden by TableGen in targets that have sub-registers. virtual unsigned composeSubRegIndicesImpl(unsigned, unsigned) const { Index: lib/CodeGen/RegisterClassInfo.cpp =================================================================== --- lib/CodeGen/RegisterClassInfo.cpp +++ lib/CodeGen/RegisterClassInfo.cpp @@ -114,7 +114,7 @@ unsigned Cost = TRI->getCostPerUse(PhysReg); MinCost = std::min(MinCost, Cost); - if (CalleeSavedAliases[PhysReg]) + if (CalleeSavedAliases[PhysReg] && !TRI->ignoreCSRForAllocationOrder(*MF)) // PhysReg aliases a CSR, save it for later. 
CSRAlias.push_back(PhysReg); else { Index: lib/Target/ARM/ARMBaseRegisterInfo.h =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.h +++ lib/Target/ARM/ARMBaseRegisterInfo.h @@ -205,6 +205,8 @@ const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const override; + + bool ignoreCSRForAllocationOrder(const MachineFunction &MF) const override; }; } // end namespace llvm Index: lib/Target/ARM/ARMBaseRegisterInfo.cpp =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -848,3 +848,13 @@ } return false; } + +bool ARMBaseRegisterInfo::ignoreCSRForAllocationOrder( + const MachineFunction &MF) const { + // To minimize code size in Thumb2, we prefer the usage of low regs (lower + // cost per use) so we can use narrow encoding. By default, caller-saved + // registers (e.g. lr, r12) are always allocated first, regardless of + // their cost per use. When optForMinSize, we prefer the low regs even if + // they are CSR because usually push/pop can be folded into existing ones. + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF); +} Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -197,9 +197,11 @@ // know how to spill them. If we make our prologue/epilogue code smarter at // some point, we can go back to using the above allocation orders for the // Thumb1 instructions that know how to use hi regs. - let AltOrders = [(add LR, GPR), (trunc GPR, 8)]; + let AltOrders = [(add LR, GPR), (trunc GPR, 8), + (add (trunc GPR, 8), R12, LR, (shl GPR, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only(); + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF) ? 
+ 3 : (1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only()); }]; } @@ -207,9 +209,11 @@ // certain operand slots, particularly as the destination. Primarily // useful for disassembly. def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> { - let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)]; + let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8), + (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only(); + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF) ? + 3 : (1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only()); }]; } @@ -235,9 +239,11 @@ // or SP (R13 or R15) are used. The ARM ISA refers to these operands // via the BadReg() pseudo-code description. def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { - let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)]; + let AltOrders = [(add LR, rGPR), (trunc rGPR, 8), + (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only(); + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF) ? + 3 : (1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only()); }]; } Index: lib/Target/ARM/ARMSubtarget.h =================================================================== --- lib/Target/ARM/ARMSubtarget.h +++ lib/Target/ARM/ARMSubtarget.h @@ -710,6 +710,8 @@ /// True if fast-isel is used. bool useFastISel() const; + + bool favorLowRegOverR12LR(const MachineFunction &MF) const; }; } // end namespace llvm Index: lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- lib/Target/ARM/ARMSubtarget.cpp +++ lib/Target/ARM/ARMSubtarget.cpp @@ -69,6 +69,9 @@ ForceFastISel("arm-force-fast-isel", cl::init(false), cl::Hidden); +static cl::opt<bool> FavorLowRegOverR12LR("arm-favor-r4-r7", cl::Hidden, + cl::desc("favor r4-r7 over r12, lr")); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. 
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -327,6 +330,18 @@ return false; } +bool ARMSubtarget::favorLowRegOverR12LR(const MachineFunction &MF) const { + if (!isThumb2()) + return false; + + if (FavorLowRegOverR12LR.getNumOccurrences()) + return FavorLowRegOverR12LR; + + // Favoring the low (callee-saved) regs over r12/lr may have a performance + // penalty, so we enable it only when optimizing for min size. + return MF.getFunction()->optForMinSize(); +} + unsigned ARMSubtarget::getMispredictionPenalty() const { return SchedModel.MispredictPenalty; } Index: test/CodeGen/ARM/favor-low-reg-for-Osize.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/favor-low-reg-for-Osize.ll @@ -0,0 +1,75 @@ +; REQUIRES: asserts +; RUN: llc -debug-only=regalloc < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LowRegFirst +; RUN: llc -debug-only=regalloc -arm-favor-r4-r7=false < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CallerSavedRegFirst + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "thumbv7m--linux-gnueabi" + +;CHECK-LABEL: test + +%struct.ContextType = type { i8*, i8*, i8*, i8*, i32 } +;LowRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] +;LowRegFirst: AllocationOrder(GPRnopc) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] +;LowRegFirst: AllocationOrder(rGPR) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] + +;CallerSavedRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] +;CallerSavedRegFirst: AllocationOrder(GPRnopc) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] +;CallerSavedRegFirst: AllocationOrder(rGPR) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] + + +; Function Attrs: minsize norecurse nounwind optsize +define void @test(%struct.ContextType* nocapture readonly %pContext) 
local_unnamed_addr #0 { +entry: + +;LowRegFirst-NOT: ldr{{.*}}.w +;LowRegFirst-NOT: str{{.*}}.w + +;CallerSavedRegFirst: ldr{{.*}}.w +;CallerSavedRegFirst: ldr{{.*}}.w +;CallerSavedRegFirst: str{{.*}}.w + + %Src1 = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 0 + %0 = load i8*, i8** %Src1, align 4 + %Len = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 4 + %1 = load i32, i32* %Len, align 4 + %Dst = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 2 + %2 = load i8*, i8** %Dst, align 4 + %arrayidx = getelementptr inbounds i8, i8* %0, i32 24 + %3 = bitcast i8* %arrayidx to i32* + %4 = load i32, i32* %3, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %0, i32 18 + %5 = bitcast i8* %arrayidx2 to i16* + %6 = load i16, i16* %5, align 2 + %arrayidx3 = getelementptr inbounds i8, i8* %0, i32 16 + %7 = bitcast i8* %arrayidx3 to i16* + %8 = load i16, i16* %7, align 2 + %arrayidx4 = getelementptr inbounds i8, i8* %0, i32 12 + %9 = bitcast i8* %arrayidx4 to i32* + %10 = load i32, i32* %9, align 4 + %arrayidx5 = getelementptr inbounds i8, i8* %0, i32 2 + %11 = bitcast i8* %arrayidx5 to i16* + %12 = load i16, i16* %11, align 2 + %add = add i32 %1, 24 + %arrayidx6 = getelementptr inbounds i8, i8* %2, i32 %add + %13 = bitcast i8* %arrayidx6 to i32* + store i32 %4, i32* %13, align 4 + %add7 = add i32 %1, 18 + %arrayidx8 = getelementptr inbounds i8, i8* %2, i32 %add7 + %14 = bitcast i8* %arrayidx8 to i16* + store i16 %6, i16* %14, align 2 + %add9 = add i32 %1, 16 + %arrayidx10 = getelementptr inbounds i8, i8* %2, i32 %add9 + %15 = bitcast i8* %arrayidx10 to i16* + store i16 %8, i16* %15, align 2 + %add11 = add i32 %1, 12 + %arrayidx12 = getelementptr inbounds i8, i8* %2, i32 %add11 + %16 = bitcast i8* %arrayidx12 to i32* + store i32 %10, i32* %16, align 4 + %add13 = add i32 %1, 2 + %arrayidx14 = getelementptr inbounds i8, i8* %2, i32 %add13 + %17 = bitcast i8* 
%arrayidx14 to i16* + store i16 %12, i16* %17, align 2 + ret void +} + +attributes #0 = { minsize norecurse nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -1,9 +1,9 @@ ; Disable shrink-wrapping on the first test otherwise we wouldn't ; exerce the path for PR18136. -; RUN: llc -mtriple=thumbv7-apple-none-macho < %s -enable-shrink-wrap=false | FileCheck %s +; RUN: llc -mtriple=thumbv7-apple-none-macho -arm-favor-r4-r7=false < %s -enable-shrink-wrap=false | FileCheck %s ; RUN: llc -mtriple=thumbv6m-apple-none-macho -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1 -; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-IOS +; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-LINUX declare void @bar(i8*)