Index: include/llvm/Target/TargetRegisterInfo.h =================================================================== --- include/llvm/Target/TargetRegisterInfo.h +++ include/llvm/Target/TargetRegisterInfo.h @@ -579,6 +579,12 @@ static void dumpReg(unsigned Reg, unsigned SubRegIndex = 0, const TargetRegisterInfo* TRI = nullptr); + /// Returns true if the per-use cost outweighs CSR/non-CSR status when + /// computing the preferred allocation order. + virtual bool costPerUseOverridesCSR(const MachineFunction &MF) const { + return false; + } + protected: /// Overridden by TableGen in targets that have sub-registers. virtual unsigned composeSubRegIndicesImpl(unsigned, unsigned) const { Index: lib/CodeGen/RegisterClassInfo.cpp =================================================================== --- lib/CodeGen/RegisterClassInfo.cpp +++ lib/CodeGen/RegisterClassInfo.cpp @@ -126,6 +126,23 @@ LastCost = Cost; } + if (TRI->costPerUseOverridesCSR(*MF)) { + // If cost per use outweighs CSR in deciding allocation order, we reorder + // RCI.Order by: 1) costPerUse 2) CSR or not 3) the existing relative + // order (using stable_sort to get this). + std::stable_sort(&RCI.Order[0], &RCI.Order[N], + [&](const MCPhysReg &a, const MCPhysReg &b) -> bool { + auto CostA = TRI->getCostPerUse(a); + auto CostB = TRI->getCostPerUse(b); + if (CostA != CostB) + return CostA < CostB; + // If one is CSR and the other is not, use non-CSR first. + if (!CSRNum[a] != !CSRNum[b]) + return !CSRNum[a]; + return false; + }); + } + // Register allocator stress test. Clip register class to N registers. 
if (StressRA && RCI.NumRegs > StressRA) RCI.NumRegs = StressRA; Index: lib/Target/ARM/ARMBaseRegisterInfo.h =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.h +++ lib/Target/ARM/ARMBaseRegisterInfo.h @@ -205,6 +205,8 @@ const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const override; + + bool costPerUseOverridesCSR(const MachineFunction &MF) const override; }; } // end namespace llvm Index: lib/Target/ARM/ARMBaseRegisterInfo.cpp =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -55,6 +55,9 @@ using namespace llvm; +static cl::opt<bool> FavorLowRegOverR12LR("arm-favor-r4-r7", cl::Hidden, + cl::desc("favor r4-r7 over r12, lr")); + ARMBaseRegisterInfo::ARMBaseRegisterInfo() : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) {} @@ -848,3 +851,16 @@ } return false; } + +bool ARMBaseRegisterInfo::costPerUseOverridesCSR( + const MachineFunction &MF) const { + // To minimize code size in Thumb2, we prefer the usage of low regs (lower + // cost per use) so we can use narrow encoding. By default, caller-saved + // registers (e.g. lr, r12) are always allocated first, regardless of + // their cost per use. When optForMinSize, we prefer the low regs even if + // they are CSR because usually push/pop can be folded into existing ones. + return MF.getSubtarget<ARMSubtarget>().isThumb2() && + (FavorLowRegOverR12LR.getNumOccurrences() + ? 
FavorLowRegOverR12LR + : MF.getFunction()->optForMinSize()); +} Index: test/CodeGen/ARM/favor-low-reg-for-Osize.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/favor-low-reg-for-Osize.ll @@ -0,0 +1,68 @@ +; REQUIRES: asserts +; RUN: llc -debug-only=regalloc < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LowRegFirst +; RUN: llc -debug-only=regalloc -arm-favor-r4-r7=false < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CallerSavedRegFirst + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "thumbv7m--linux-gnueabi" + +%struct.ContextType = type { i8*, i8*, i8*, i8*, i32 } +;CHECK-LowRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] +;CHECK-CallerSavedRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] + +; Function Attrs: minsize norecurse nounwind optsize +define void @test(%struct.ContextType* nocapture readonly %pContext) local_unnamed_addr #0 { +entry: +;CHECK-LABEL: test + +;CHECK-LowRegFirst-NOT: ldr{{.*}}.w +;CHECK-LowRegFirst-NOT: str{{.*}}.w + +;CHECK-CallerSavedRegFirst: ldr{{.*}}.w lr, +;CHECK-CallerSavedRegFirst: ldr{{.*}}.w r12 +;CHECK-CallerSavedRegFirst: str{{.*}}.w r12 + %Src1 = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 0 + %0 = load i8*, i8** %Src1, align 4 + %Len = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 4 + %1 = load i32, i32* %Len, align 4 + %Dst = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 2 + %2 = load i8*, i8** %Dst, align 4 + %arrayidx = getelementptr inbounds i8, i8* %0, i32 24 + %3 = bitcast i8* %arrayidx to i32* + %4 = load i32, i32* %3, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %0, i32 18 + %5 = bitcast i8* %arrayidx2 to i16* + %6 = load i16, i16* %5, align 2 + %arrayidx3 = 
getelementptr inbounds i8, i8* %0, i32 16 + %7 = bitcast i8* %arrayidx3 to i16* + %8 = load i16, i16* %7, align 2 + %arrayidx4 = getelementptr inbounds i8, i8* %0, i32 12 + %9 = bitcast i8* %arrayidx4 to i32* + %10 = load i32, i32* %9, align 4 + %arrayidx5 = getelementptr inbounds i8, i8* %0, i32 2 + %11 = bitcast i8* %arrayidx5 to i16* + %12 = load i16, i16* %11, align 2 + %add = add i32 %1, 24 + %arrayidx6 = getelementptr inbounds i8, i8* %2, i32 %add + %13 = bitcast i8* %arrayidx6 to i32* + store i32 %4, i32* %13, align 4 + %add7 = add i32 %1, 18 + %arrayidx8 = getelementptr inbounds i8, i8* %2, i32 %add7 + %14 = bitcast i8* %arrayidx8 to i16* + store i16 %6, i16* %14, align 2 + %add9 = add i32 %1, 16 + %arrayidx10 = getelementptr inbounds i8, i8* %2, i32 %add9 + %15 = bitcast i8* %arrayidx10 to i16* + store i16 %8, i16* %15, align 2 + %add11 = add i32 %1, 12 + %arrayidx12 = getelementptr inbounds i8, i8* %2, i32 %add11 + %16 = bitcast i8* %arrayidx12 to i32* + store i32 %10, i32* %16, align 4 + %add13 = add i32 %1, 2 + %arrayidx14 = getelementptr inbounds i8, i8* %2, i32 %add13 + %17 = bitcast i8* %arrayidx14 to i16* + store i16 %12, i16* %17, align 2 + ret void +} + +attributes #0 = { minsize norecurse nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -1,9 +1,9 @@ ; Disable shrink-wrapping on the first test otherwise we wouldn't ; exerce the path for PR18136. 
-; RUN: llc -mtriple=thumbv7-apple-none-macho < %s -enable-shrink-wrap=false | FileCheck %s +; RUN: llc -mtriple=thumbv7-apple-none-macho -arm-favor-r4-r7=false < %s -enable-shrink-wrap=false | FileCheck %s ; RUN: llc -mtriple=thumbv6m-apple-none-macho -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1 -; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-IOS +; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-LINUX declare void @bar(i8*)