diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -291,6 +291,14 @@
 
   /// This is called after a .mir file was loaded.
   virtual void mirFileLoaded(MachineFunction &MF) const;
+
+  /// True if the register allocator should use the allocation orders exactly
+  /// as written in the tablegen descriptions, false if it should allocate
+  /// the specified physical register later if it is callee-saved.
+  virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                           unsigned PhysReg) const {
+    return false;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -90,6 +90,7 @@
 void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
   assert(RC && "no register class given");
   RCInfo &RCI = RegClass[RC->getID()];
+  auto &STI = MF->getSubtarget();
 
   // Raw register count, including all reserved regs.
   unsigned NumRegs = RC->getNumRegs();
@@ -114,7 +115,8 @@
     unsigned Cost = TRI->getCostPerUse(PhysReg);
     MinCost = std::min(MinCost, Cost);
 
-    if (CalleeSavedAliases[PhysReg])
+    if (CalleeSavedAliases[PhysReg] &&
+        !STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
       // PhysReg aliases a CSR, save it for later.
       CSRAlias.push_back(PhysReg);
     else {
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -227,9 +227,10 @@
   // know how to spill them. If we make our prologue/epilogue code smarter at
   // some point, we can go back to using the above allocation orders for the
   // Thumb1 instructions that know how to use hi regs.
-  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8),
+                   (add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r15]";
 }
@@ -238,9 +239,10 @@
 // certain operand slots, particularly as the destination. Primarily
 // useful for disassembly.
 def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
-  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
+                   (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticString = "operand must be a register in range [r0, r14]";
 }
@@ -295,9 +297,10 @@
 // or SP (R13 or R15) are used. The ARM ISA refers to these operands
 // via the BadReg() pseudo-code description.
 def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
-  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
+                   (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
   let AltOrderSelect = [{
-    return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+    return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
   }];
   let DiagnosticType = "rGPR";
 }
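The RegisterClassInfo change above is the only consumer of the new hook. As a rough model of what it does (a simplified sketch with invented helper parameters, not the actual upstream implementation), compute() walks the raw tablegen order, defers every register that aliases a callee-saved register to the end, and the new hook exempts a register from that deferral:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCRegisterInfo.h"

using namespace llvm;

// Simplified model of RegisterClassInfo::compute()'s ordering loop.
// IsCSRAlias and IgnoreCSR stand in for CalleeSavedAliases[] and
// STI.ignoreCSRForAllocationOrder(); the real code also tracks costs.
static SmallVector<MCPhysReg, 16>
buildAllocationOrder(ArrayRef<MCPhysReg> RawOrder, const BitVector &Reserved,
                     function_ref<bool(MCPhysReg)> IsCSRAlias,
                     function_ref<bool(MCPhysReg)> IgnoreCSR) {
  SmallVector<MCPhysReg, 16> Order, DeferredCSR;
  for (MCPhysReg R : RawOrder) {
    if (Reserved.test(R))
      continue;                 // reserved registers are never allocatable
    if (IsCSRAlias(R) && !IgnoreCSR(R))
      DeferredCSR.push_back(R); // costs prologue/epilogue work: use last
    else
      Order.push_back(R);       // keep the raw tablegen position
  }
  Order.append(DeferredCSR.begin(), DeferredCSR.end());
  return Order;
}

The ARM override below returns true for all GPRs under Thumb2 with minsize, so the callee-saved low registers r4-r7 keep their early slots instead of migrating behind r12, lr and the high registers.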
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -856,6 +856,10 @@
   unsigned getPrefLoopAlignment() const {
     return PrefLoopAlignment;
   }
+
+  bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                   unsigned PhysReg) const override;
+  unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -413,3 +413,45 @@
          ((isTargetMachO() && !isThumb1Only()) ||
           (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
 }
+
+unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
+  // The GPR register class has multiple possible allocation orders, with
+  // tradeoffs preferred by different sub-architectures and optimisation goals.
+  // The allocation orders are:
+  // 0: (the default tablegen order, not used)
+  // 1: r14, r0-r13
+  // 2: r0-r7
+  // 3: r0-r7, r12, lr, r8-r11
+  // Note that the register allocator will change this order so that
+  // callee-saved registers are used later, as they require extra work in the
+  // prologue/epilogue (though we sometimes override that).
+
+  // For thumb1-only targets, only the low registers are allocatable.
+  if (isThumb1Only())
+    return 2;
+
+  // Allocate low registers first, so we can select more 16-bit instructions.
+  // We also (in ignoreCSRForAllocationOrder) override the default behaviour
+  // with regard to callee-saved registers, because pushing extra registers
+  // is much cheaper (in terms of code size) than using high registers. After
+  // that, we allocate r12 (doesn't need to be saved), lr (saving it means we
+  // can return with the pop, don't need an extra "bx lr") and then the rest
+  // of the high registers.
+  if (isThumb2() && MF.getFunction().hasMinSize())
+    return 3;
+
+  // Otherwise, allocate in the default order, using LR first because saving
+  // it allows a shorter epilogue sequence.
+  return 1;
+}
+
+bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                               unsigned PhysReg) const {
+  // To minimize code size in Thumb2, we prefer the usage of low regs (lower
+  // cost per use) so we can use narrow encodings. By default, caller-saved
+  // registers (e.g. lr, r12) are always allocated first, regardless of their
+  // cost per use. Under minsize we prefer the low regs even if they are
+  // CSRs, because the extra push/pop can usually be folded into existing ones.
+  return isThumb2() && MF.getFunction().hasMinSize() &&
+         ARM::GPRRegClass.contains(PhysReg);
+}
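Taken together, the two overrides mean that a Thumb2 function carrying the minsize attribute selects AltOrder 3 and disables CSR deferral for GPRs, so the allocator sees exactly r0-r7, r12, lr, r8-r11. A hypothetical assertion-style helper spelling out that invariant (checkMinSizeGPROrder is an invented name, and the includes assume the ARM backend's internal headers are on the include path, as they are for in-tree ARM sources):

#include "ARMSubtarget.h" // ARM backend internal header, not installed
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include <cassert>

using namespace llvm;

// Hypothetical sanity check, not part of the patch: for Thumb2 + minsize
// both hooks fire, so callee-saved low registers like r4 stay ahead of
// r8-r11 in the final allocation order.
static void checkMinSizeGPROrder(const MachineFunction &MF) {
  const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
  if (!ST.isThumb2() || !MF.getFunction().hasMinSize())
    return;
  // AltOrder 3 is the new r0-r7, r12, lr, r8-r11 order added above.
  assert(ST.getGPRAllocationOrder(MF) == 3);
  // CSR deferral is disabled for GPRs, so r4 keeps its early position.
  assert(ST.ignoreCSRForAllocationOrder(MF, ARM::R4));
}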
diff --git a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
--- a/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/llvm/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -60,7 +60,7 @@
 
 while.body:
 ; CHECK: while.body
-; CHECK: mul r{{[0-9]+}}
+; CHECK: muls r{{[0-9]+}}
 ; CHECK: muls
   %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
   %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]
diff --git a/llvm/test/CodeGen/ARM/favor-low-reg-for-Osize.ll b/llvm/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
@@ -0,0 +1,29 @@
+; REQUIRES: asserts
+; RUN: llc -debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK
+; RUN: FileCheck %s < %t --check-prefix=DEBUG
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
+target triple = "thumbv7m--linux-gnueabi"
+
+
+; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ]
+
+define i32 @test_minsize(i32 %x) optsize minsize {
+; CHECK-LABEL: test_minsize:
+entry:
+; CHECK: mov r4, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov r0, r4
+  ret i32 %x
+}
+
+; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ]
+
+define i32 @test_optsize(i32 %x) optsize {
+; CHECK-LABEL: test_optsize:
+entry:
+; CHECK: mov r12, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov r0, r12
+  ret i32 %x
+}
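The AllocationOrder(GPR) lines matched by the DEBUG prefix come from the register allocator's -debug-only=regalloc output, which is why the test carries REQUIRES: asserts. For illustration, a sketch of querying the same per-function order programmatically through RegisterClassInfo, the class patched above (printAllocationOrder is an invented helper, not an LLVM API):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper, not part of the patch: prints the final allocation
// order for a register class, mirroring the "AllocationOrder(...)" debug
// lines the DEBUG prefix matches.
static void printAllocationOrder(const MachineFunction &MF,
                                 const TargetRegisterClass *RC) {
  RegisterClassInfo RCI;
  RCI.runOnMachineFunction(MF); // recomputed per function, as regalloc does
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  errs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = [";
  for (MCPhysReg R : RCI.getOrder(RC))
    errs() << ' ' << printReg(R, TRI);
  errs() << " ]\n";
}

The CHECK lines follow directly from the two DEBUG orders: in test_minsize the low register r4 precedes r12 and lr in the order, so the live value is parked there, while in test_optsize the CSR deferral is still active and the caller-saved r12 is picked instead.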