Index: include/llvm/Target/TargetRegisterInfo.h =================================================================== --- include/llvm/Target/TargetRegisterInfo.h +++ include/llvm/Target/TargetRegisterInfo.h @@ -582,6 +582,12 @@ static void dumpReg(unsigned Reg, unsigned SubRegIndex = 0, const TargetRegisterInfo* TRI = nullptr); + /// Returns true to ignore the preferring of CSR when computing the preferred + /// register allocation order. + virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF) const { + return false; + } + protected: /// Overridden by TableGen in targets that have sub-registers. virtual unsigned composeSubRegIndicesImpl(unsigned, unsigned) const { Index: lib/CodeGen/RegisterClassInfo.cpp =================================================================== --- lib/CodeGen/RegisterClassInfo.cpp +++ lib/CodeGen/RegisterClassInfo.cpp @@ -114,7 +114,7 @@ unsigned Cost = TRI->getCostPerUse(PhysReg); MinCost = std::min(MinCost, Cost); - if (CalleeSavedAliases[PhysReg]) + if (CalleeSavedAliases[PhysReg] && !TRI->ignoreCSRForAllocationOrder(*MF)) // PhysReg aliases a CSR, save it for later. 
CSRAlias.push_back(PhysReg); else { Index: lib/Target/ARM/ARMBaseRegisterInfo.h =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.h +++ lib/Target/ARM/ARMBaseRegisterInfo.h @@ -205,6 +205,8 @@ const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC) const override; + + bool ignoreCSRForAllocationOrder(const MachineFunction &MF) const override; }; } // end namespace llvm Index: lib/Target/ARM/ARMBaseRegisterInfo.cpp =================================================================== --- lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -848,3 +848,13 @@ } return false; } + +bool ARMBaseRegisterInfo::ignoreCSRForAllocationOrder( + const MachineFunction &MF) const { + // To minimize code size in Thumb2, we prefer the usage of low regs (lower + // cost per use) so we can use narrow encoding. By default, caller-saved + // registers (e.g. lr, r12) are always allocated first, regardless of + // their cost per use. When optForMinSize, we prefer the low regs even if + // they are CSR because usually push/pop can be folded into existing ones. + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF); +} Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -197,9 +197,11 @@ // know how to spill them. If we make our prologue/epilogue code smarter at // some point, we can go back to using the above allocation orders for the // Thumb1 instructions that know how to use hi regs. - let AltOrders = [(add LR, GPR), (trunc GPR, 8)]; + let AltOrders = [(add LR, GPR), (trunc GPR, 8), + (add (trunc GPR, 8), R12, LR, (shl GPR, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only(); + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF) ? 
+ 3 : (1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only()); }]; } @@ -207,9 +209,11 @@ // certain operand slots, particularly as the destination. Primarily // useful for disassembly. def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> { - let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)]; + let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8), + (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only(); + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF) ? + 3 : (1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only()); }]; } @@ -235,9 +239,11 @@ // or SP (R13 or R15) are used. The ARM ISA refers to these operands // via the BadReg() pseudo-code description. def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { - let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)]; + let AltOrders = [(add LR, rGPR), (trunc rGPR, 8), + (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only(); + return MF.getSubtarget<ARMSubtarget>().favorLowRegOverR12LR(MF) ? + 3 : (1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only()); }]; } Index: lib/Target/ARM/ARMSubtarget.h =================================================================== --- lib/Target/ARM/ARMSubtarget.h +++ lib/Target/ARM/ARMSubtarget.h @@ -710,6 +710,8 @@ /// True if fast-isel is used. bool useFastISel() const; + + bool favorLowRegOverR12LR(const MachineFunction &MF) const; }; } // end namespace llvm Index: lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- lib/Target/ARM/ARMSubtarget.cpp +++ lib/Target/ARM/ARMSubtarget.cpp @@ -69,6 +69,9 @@ ForceFastISel("arm-force-fast-isel", cl::init(false), cl::Hidden); +static cl::opt<bool> FavorLowRegOverR12LR("arm-favor-r4-r7", cl::Hidden, + cl::desc("favor r4-r7 over r12, lr")); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. 
ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -327,6 +330,18 @@ return false; } +bool ARMSubtarget::favorLowRegOverR12LR(const MachineFunction &MF) const { + if (!isThumb2()) + return false; + + if (FavorLowRegOverR12LR.getNumOccurrences()) + return FavorLowRegOverR12LR; + + // Favoring the low (callee-saved) regs over r12/lr may have a performance + // penalty, so we enable it only when optimizing for min size. + return MF.getFunction()->optForMinSize(); +} + unsigned ARMSubtarget::getMispredictionPenalty() const { return SchedModel.MispredictPenalty; } Index: test/CodeGen/ARM/favor-low-reg-for-Osize.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/favor-low-reg-for-Osize.ll @@ -0,0 +1,75 @@ +; REQUIRES: asserts +; RUN: llc -debug-only=regalloc < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LowRegFirst +; RUN: llc -debug-only=regalloc -arm-favor-r4-r7=false < %s 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CallerSavedRegFirst + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "thumbv7m--linux-gnueabi" + +;CHECK-LABEL: test + +%struct.ContextType = type { i8*, i8*, i8*, i8*, i32 } +;LowRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] +;LowRegFirst: AllocationOrder(GPRnopc) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] +;LowRegFirst: AllocationOrder(rGPR) = [ %R0 %R1 %R2 %R3 %R4 %R5 %R6 %R7 %R12 %LR %R8 %R9 %R10 %R11 ] + +;CallerSavedRegFirst: AllocationOrder(GPR) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] +;CallerSavedRegFirst: AllocationOrder(GPRnopc) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] +;CallerSavedRegFirst: AllocationOrder(rGPR) = [ %R0 %R1 %R2 %R3 %R12 %LR %R4 %R5 %R6 %R7 %R8 %R9 %R10 %R11 ] + + +; Function Attrs: minsize norecurse nounwind optsize +define void @test(%struct.ContextType* nocapture readonly %pContext) 
local_unnamed_addr #0 { +entry: + +;LowRegFirst-NOT: ldr{{.*}}.w +;LowRegFirst-NOT: str{{.*}}.w + +;CallerSavedRegFirst: ldr{{.*}}.w +;CallerSavedRegFirst: ldr{{.*}}.w +;CallerSavedRegFirst: str{{.*}}.w + + %Src1 = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 0 + %0 = load i8*, i8** %Src1, align 4 + %Len = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 4 + %1 = load i32, i32* %Len, align 4 + %Dst = getelementptr inbounds %struct.ContextType, %struct.ContextType* %pContext, i32 0, i32 2 + %2 = load i8*, i8** %Dst, align 4 + %arrayidx = getelementptr inbounds i8, i8* %0, i32 24 + %3 = bitcast i8* %arrayidx to i32* + %4 = load i32, i32* %3, align 4 + %arrayidx2 = getelementptr inbounds i8, i8* %0, i32 18 + %5 = bitcast i8* %arrayidx2 to i16* + %6 = load i16, i16* %5, align 2 + %arrayidx3 = getelementptr inbounds i8, i8* %0, i32 16 + %7 = bitcast i8* %arrayidx3 to i16* + %8 = load i16, i16* %7, align 2 + %arrayidx4 = getelementptr inbounds i8, i8* %0, i32 12 + %9 = bitcast i8* %arrayidx4 to i32* + %10 = load i32, i32* %9, align 4 + %arrayidx5 = getelementptr inbounds i8, i8* %0, i32 2 + %11 = bitcast i8* %arrayidx5 to i16* + %12 = load i16, i16* %11, align 2 + %add = add i32 %1, 24 + %arrayidx6 = getelementptr inbounds i8, i8* %2, i32 %add + %13 = bitcast i8* %arrayidx6 to i32* + store i32 %4, i32* %13, align 4 + %add7 = add i32 %1, 18 + %arrayidx8 = getelementptr inbounds i8, i8* %2, i32 %add7 + %14 = bitcast i8* %arrayidx8 to i16* + store i16 %6, i16* %14, align 2 + %add9 = add i32 %1, 16 + %arrayidx10 = getelementptr inbounds i8, i8* %2, i32 %add9 + %15 = bitcast i8* %arrayidx10 to i16* + store i16 %8, i16* %15, align 2 + %add11 = add i32 %1, 12 + %arrayidx12 = getelementptr inbounds i8, i8* %2, i32 %add11 + %16 = bitcast i8* %arrayidx12 to i32* + store i32 %10, i32* %16, align 4 + %add13 = add i32 %1, 2 + %arrayidx14 = getelementptr inbounds i8, i8* %2, i32 %add13 + %17 = bitcast i8* 
%arrayidx14 to i16* + store i16 %12, i16* %17, align 2 + ret void +} + +attributes #0 = { minsize norecurse nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m3" "target-features"="+hwdiv" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -1,9 +1,9 @@ ; Disable shrink-wrapping on the first test otherwise we wouldn't ; exerce the path for PR18136. -; RUN: llc -mtriple=thumbv7-apple-none-macho < %s -enable-shrink-wrap=false | FileCheck %s +; RUN: llc -mtriple=thumbv7-apple-none-macho -arm-favor-r4-r7=false < %s -enable-shrink-wrap=false | FileCheck %s ; RUN: llc -mtriple=thumbv6m-apple-none-macho -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1 -; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS -; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-IOS +; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim -arm-favor-r4-r7=false < %s | FileCheck %s --check-prefix=CHECK-LINUX declare void @bar(i8*)