Index: include/llvm/Target/TargetRegisterInfo.h =================================================================== --- include/llvm/Target/TargetRegisterInfo.h +++ include/llvm/Target/TargetRegisterInfo.h @@ -579,6 +579,12 @@ static void dumpReg(unsigned Reg, unsigned SubRegIndex = 0, const TargetRegisterInfo* TRI = nullptr); + /// Returns true if the per-use cost outweighs CSR/non-CSR status when + /// computing the preferred allocation order. + virtual bool costPerUseOverridesCSR(const MachineFunction &MF) const { + return false; + } + protected: /// Overridden by TableGen in targets that have sub-registers. virtual unsigned composeSubRegIndicesImpl(unsigned, unsigned) const { Index: lib/CodeGen/RegisterClassInfo.cpp =================================================================== --- lib/CodeGen/RegisterClassInfo.cpp +++ lib/CodeGen/RegisterClassInfo.cpp @@ -126,6 +126,23 @@ LastCost = Cost; } + if (TRI->costPerUseOverridesCSR(*MF)) { + // If cost per use outweighs CSR in deciding allocation order, we reorder + // RCI.Order by: 1) costPerUse 2) CSR or not 3) the existing relative + // order (using stable_sort to get this). + std::stable_sort(&RCI.Order[0], &RCI.Order[N], + [&](const MCPhysReg &a, const MCPhysReg &b) -> bool { + auto CostA = TRI->getCostPerUse(a); + auto CostB = TRI->getCostPerUse(b); + if (CostA != CostB) + return CostA < CostB; + // If one is CSR and the other is not, use non-CSR first. + if (!CSRNum[a] != !CSRNum[b]) + return !CSRNum[a]; + return false; + }); + } + // Register allocator stress test. Clip register class to N registers. 
if (StressRA && RCI.NumRegs > StressRA) RCI.NumRegs = StressRA; Index: lib/Target/ARM/ARMBaseRegisterInfo.h =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.h +++ lib/Target/ARM/ARMBaseRegisterInfo.h @@ -205,6 +205,8 @@ const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const override; + + bool costPerUseOverridesCSR(const MachineFunction &MF) const override; }; } // end namespace llvm Index: lib/Target/ARM/ARMBaseRegisterInfo.cpp =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -55,6 +55,9 @@ using namespace llvm; +static cl::opt<bool> FavorLowRegOverR12LR("arm-favor-r4-r7", cl::Hidden, + cl::desc("favor r4-r7 over r12, lr")); + ARMBaseRegisterInfo::ARMBaseRegisterInfo() : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) {} @@ -848,3 +851,16 @@ } return false; } + +bool ARMBaseRegisterInfo::costPerUseOverridesCSR( + const MachineFunction &MF) const { + // To minimize code size in Thumb2, we prefer the usage of low regs (lower + // cost per use) so we can use narrow encoding. By default, caller-saved + // registers (e.g. lr, r12) are always allocated first, regardless of + // their cost per use. When optForMinSize, we prefer the low regs even if + // they are CSR because usually push/pop can be folded into existing ones. + return MF.getSubtarget<ARMSubtarget>().isThumb2() && + (FavorLowRegOverR12LR.getNumOccurrences() + ? 
FavorLowRegOverR12LR + : MF.getFunction()->optForMinSize()); +} Index: test/CodeGen/ARM/favor-low-reg-for-Osize.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/favor-low-reg-for-Osize.ll @@ -0,0 +1,68 @@ +; REQUIRES: asserts +; RUN: llc -debug-only=regalloc < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LowRegFirst +; RUN: llc -debug-only=regalloc -arm-favor-r4-r7=false < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CallerSavedRegFirst + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "thumbv7m--linux-gnueabi" + +%struct.ContextType = type { i8*, i8*, i8*, i8*, i32 } +;CHECK-LowRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] +;CHECK-CallerSavedRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] + +; Function Attrs: minsize norecurse nounwind optsize +define void @test(%struct.ContextType* nocapture readonly %pContext) local_unnamed_addr #0 { +entry: +;CHECK-LABEL: test + +;CHECK-LowRegFirst-NOT: ldr{{.*}}.w +;CHECK-LowRegFirst-NOT: str{{.*}}.w + +;CHECK-CallerSavedRegFirst: ldr{{.*}}.w lr, +;CHECK-CallerSavedRegFirst: ldr{{.*}}.w r12 +;CHECK-CallerSavedRegFirst: str{{.*}}.w r12 + %Src1 = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 0 + %0 = load i8*, i8** %Src1, align 4 + %Len = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 4 + %1 = load i32, i32* %Len, align 4 + %Dst = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 2 + %2 = load i8*, i8** %Dst, align 4 + %arrayidx = getelementptr inbounds i8, i8* %0, i32 24 + %3 = bitcast i8* %arrayidx to i32* + %4 = load i32, i32* %3, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %0, i32 18 + %5 = bitcast i8* %arrayidx2 to i16* + %6 = load i16, i16* %5, align 2 + %arrayidx3 = 
getelementptr inbounds i8, i8* %0, i32 16 + %7 = bitcast i8* %arrayidx3 to i16* + %8 = load i16, i16* %7, align 2 + %arrayidx4 = getelementptr inbounds i8, i8* %0, i32 12 + %9 = bitcast i8* %arrayidx4 to i32* + %10 = load i32, i32* %9, align 4 + %arrayidx5 = getelementptr inbounds i8, i8* %0, i32 2 + %11 = bitcast i8* %arrayidx5 to i16* + %12 = load i16, i16* %11, align 2 + %add = add i32 %1, 24 + %arrayidx6 = getelementptr inbounds i8, i8* %2, i32 %add + %13 = bitcast i8* %arrayidx6 to i32* + store i32 %4, i32* %13, align 4 + %add7 = add i32 %1, 18 + %arrayidx8 = getelementptr inbounds i8, i8* %2, i32 %add7 + %14 = bitcast i8* %arrayidx8 to i16* + store i16 %6, i16* %14, align 2 + %add9 = add i32 %1, 16 + %arrayidx10 = getelementptr inbounds i8, i8* %2, i32 %add9 + %15 = bitcast i8* %arrayidx10 to i16* + store i16 %8, i16* %15, align 2 + %add11 = add i32 %1, 12 + %arrayidx12 = getelementptr inbounds i8, i8* %2, i32 %add11 + %16 = bitcast i8* %arrayidx12 to i32* + store i32 %10, i32* %16, align 4 + %add13 = add i32 %1, 2 + %arrayidx14 = getelementptr inbounds i8, i8* %2, i32 %add13 + %17 = bitcast i8* %arrayidx14 to i16* + store i16 %12, i16* %17, align 2 + ret void +} + +attributes #0 = { minsize norecurse nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -1,9 +1,9 @@ ; Disable shrink-wrapping on the first test otherwise we wouldn't ; exerce the path for PR18136. 
-; RUN: llc -mtriple=thumbv7-apple-none-macho < %s -enable-shrink-wrap=false | FileCheck %s +; RUN: llc -mtriple=thumbv7-apple-none-macho -arm-favor-r4-r7=false < %s -enable-shrink-wrap=false | FileCheck %s ; RUN: llc -mtriple=thumbv6m-apple-none-macho -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1 -; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-IOS +; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-LINUX declare void @bar(i8*)