Index: lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- lib/CodeGen/RegAllocGreedy.cpp
+++ lib/CodeGen/RegAllocGreedy.cpp
@@ -97,10 +97,10 @@
                                        cl::init(false));
 
 // FIXME: Find a good default for this flag and remove the flag.
-static cl::opt<unsigned>
-CSRFirstTimeCost("regalloc-csr-first-time-cost",
-              cl::desc("Cost for first time use of callee-saved register."),
-              cl::init(0), cl::Hidden);
+static cl::opt<int> CSRFirstTimeCost(
+    "regalloc-csr-first-time-cost",
+    cl::desc("Cost for first time use of callee-saved register."), cl::init(-1),
+    cl::Hidden);
 
 static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
                                        createGreedyRegisterAllocator);
@@ -2351,10 +2351,11 @@
 }
 
 void RAGreedy::initializeCSRCost() {
-  // We use the larger one out of the command-line option and the value report
-  // by TRI.
-  CSRCost = BlockFrequency(
-      std::max((unsigned)CSRFirstTimeCost, TRI->getCSRFirstUseCost()));
+  // The cost from the command-line option overrides the value reported by TRI.
+  if (CSRFirstTimeCost != -1)
+    CSRCost = BlockFrequency(CSRFirstTimeCost);
+  else
+    CSRCost = BlockFrequency(TRI->getCSRFirstUseCost());
   if (!CSRCost.getFrequency())
     return;
 
@@ -2468,9 +2469,16 @@
     DEBUG(dbgs() << "Checking profitability:\n");
     BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
     BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+    // If we switch from a non-CSR register to a CSR register, we require that
+    // the cost difference be larger than CSRCost to justify the recoloring,
+    // because such recoloring may hinder shrink-wrapping.
+    bool ChangedToCSR = !RegClassInfo.getLastCalleeSavedAlias(CurrPhys) &&
+                        RegClassInfo.getLastCalleeSavedAlias(PhysReg);
     DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
-                 << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
-    if (OldCopiesCost < NewCopiesCost) {
+                 << "\nNew Cost: " << NewCopiesCost.getFrequency()
+                 << "\nCSRCost: "
+                 << (ChangedToCSR ? CSRCost.getFrequency() : 0) << '\n');
+    if (OldCopiesCost < NewCopiesCost + (ChangedToCSR ? CSRCost : 0)) {
       DEBUG(dbgs() << "=> Not profitable.\n");
       continue;
     }
Index: lib/Target/AArch64/AArch64RegisterInfo.h
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.h
+++ lib/Target/AArch64/AArch64RegisterInfo.h
@@ -39,12 +39,7 @@
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID) const override;
 
-  unsigned getCSRFirstUseCost() const override {
-    // The cost will be compared against BlockFrequency where entry has the
-    // value of 1 << 14. A value of 5 will choose to spill or split really
-    // cold path instead of using a callee-saved register.
-    return 5;
-  }
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
 
   // Calls involved in thread-local variable lookup save more registers than
   // normal calls, so they need a different mask to represent this.
Index: lib/Target/ARM/ARMBaseRegisterInfo.h
===================================================================
--- lib/Target/ARM/ARMBaseRegisterInfo.h
+++ lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -150,6 +150,8 @@
                              const VirtRegMap *VRM,
                              const LiveRegMatrix *Matrix) const override;
 
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
+
   void updateRegAllocHint(unsigned Reg, unsigned NewReg,
                           MachineFunction &MF) const override;
 
Index: lib/Target/PowerPC/PPCRegisterInfo.h
===================================================================
--- lib/Target/PowerPC/PPCRegisterInfo.h
+++ lib/Target/PowerPC/PPCRegisterInfo.h
@@ -84,6 +84,8 @@
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
+
   /// We require the register scavenger.
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
Index: lib/Target/X86/X86RegisterInfo.h
===================================================================
--- lib/Target/X86/X86RegisterInfo.h
+++ lib/Target/X86/X86RegisterInfo.h
@@ -105,6 +105,8 @@
                                        CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const override;
 
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
+
   // Calls involved in thread-local variable lookup save more registers than
   // normal calls, so they need a different mask to represent this.
   const uint32_t *getDarwinTLSCallPreservedMask() const;
Index: test/CodeGen/ARM/divmod-eabi.ll
===================================================================
--- test/CodeGen/ARM/divmod-eabi.ll
+++ test/CodeGen/ARM/divmod-eabi.ll
@@ -225,6 +225,7 @@
 ; DARWIN: __modsi3
 ; DARWIN: mov [[sum:r[0-9]+]], r0
 ; WINDOWS: __rt_sdiv
+; WINDOWS: mov r0, r1
 ; WINDOWS: mov [[rem:r[0-9]+]], r1
   %rem1 = srem i32 %b, %rem
 ; EABI: __aeabi_idivmod
Index: test/CodeGen/X86/atom-fixup-lea2.ll
===================================================================
--- test/CodeGen/X86/atom-fixup-lea2.ll
+++ test/CodeGen/X86/atom-fixup-lea2.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
 ; CHECK:BB#5
-; CHECK-NEXT:leal
+; CHECK-NEXT:addl 12(%esp), %esi
 ; CHECK-NEXT:leal
 ; CHECK-NEXT:leal
 ; CHECK-NEXT:movl
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -1067,7 +1067,6 @@
 ; CHECK-LABEL: test_cold_calls:
 ; CHECK: %entry
 ; CHECK: %else
-; CHECK: %exit
 ; CHECK: %then
 
 entry:
Index: test/CodeGen/X86/csr-split1.ll
===================================================================
--- test/CodeGen/X86/csr-split1.ll
+++ test/CodeGen/X86/csr-split1.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; Check that CSR splitting works properly for the tests below.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@cond = common local_unnamed_addr global i64 0, align 8
+@ret = common local_unnamed_addr global i64 0, align 8
+@p = common local_unnamed_addr global i64* null, align 8
+
+declare void @foo(i64, i64, i64)
+
+; After CSR split, shrink-wrapping is enabled and the prologue is moved from
+; the entry block to BB#1.
+;
+; CHECK-LABEL: test1:
+; CHECK: .LBB0_1:
+; CHECK-NEXT: push
+; CHECK-NEXT: push
+; CHECK-NEXT: push
+; CHECK: callq foo
+; CHECK: pop
+; CHECK-NEXT: pop
+; CHECK-NEXT: pop
+; CHECK-NEXT: jmp .LBB0_2
+define void @test1(i64 %i, i64 %j, i64 %k) nounwind {
+entry:
+  %t0 = load i64, i64* @cond, align 8
+  %tobool = icmp eq i64 %t0, 0
+  br i1 %tobool, label %if.end, label %if.then, !prof !0
+
+if.then:                                          ; preds = %entry
+  tail call void @foo(i64 %i, i64 %j, i64 %k)
+  %add = add nsw i64 %j, %i
+  %add1 = add nsw i64 %add, %k
+  store i64 %add1, i64* @ret, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %t1 = load i64*, i64** @p, align 8
+  store volatile i64 3, i64* %t1, align 8
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 2000, i32 1}
+
+; After CSR split, even though shrink-wrapping is not enabled because stack
+; allocation is used, the parameter-passing moves can still be sunk from the
+; prologue to BB#1.
+;
+; CHECK-LABEL: test2:
+; CHECK-NEXT: # BB#0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: cmpq
+; CHECK-NEXT: jne .LBB1_1
+; CHECK: .LBB1_1:
+; CHECK-NEXT: movq %rdi,
+; CHECK-NEXT: movq %rsi,
+; CHECK-NEXT: movq %rdx,
+; CHECK-NEXT: callq foo
+;
+define void @test2(i64 %i, i64 %j, i64 %k) nounwind {
+entry:
+  %t0 = load i64, i64* @cond, align 8
+  %tobool = icmp eq i64 %t0, 0
+  br i1 %tobool, label %if.end, label %if.then, !prof !1
+
+if.then:                                          ; preds = %entry
+  tail call void @foo(i64 %i, i64 %j, i64 %k)
+  %add = add nsw i64 %j, %i
+  %add1 = add nsw i64 %add, %k
+  store i64 %add1, i64* @ret, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %t1 = alloca [3 x i8], align 16
+  store [3 x i8]* %t1, [3 x i8]** bitcast (i64** @p to [3 x i8]**), align 8
+  ret void
+}
+
+!1 = !{!"branch_weights", i32 2000, i32 1}
Index: test/CodeGen/X86/mul-i1024.ll
===================================================================
--- test/CodeGen/X86/mul-i1024.ll
+++ test/CodeGen/X86/mul-i1024.ll
@@ -4460,7 +4460,8 @@
 ; X64-NEXT: mulq %r9
 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r14, %r9
+; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %rdi, %r9
 ; X64-NEXT: addq %rax, %r9
 ; X64-NEXT: movq %rcx, %rax
 ; X64-NEXT: adcq %rdx, %rax
@@ -4469,7 +4470,7 @@
 ; X64-NEXT: movq %rax, %rbp
 ; X64-NEXT: movq %r11, %rax
 ; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: addq %r14, %rax
+; X64-NEXT: addq %rdi, %rax
 ; X64-NEXT: adcq %rcx, %r15
 ; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq (%r10), %rax
@@ -4477,9 +4478,8 @@
 ; X64-NEXT: xorl %r15d, %r15d
 ; X64-NEXT: mulq %r15
 ; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rdi, %rax
 ; X64-NEXT: movq %rsi, %rax
 ; X64-NEXT: adcq %rcx, %rax
 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
@@ -4489,8 +4489,7 @@
 ; X64-NEXT: xorl %r8d, %r8d
 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: movq %r14, %rcx
 ; X64-NEXT: addq %rax, %rcx
 ; X64-NEXT: movq %rsi, %rax
 ; X64-NEXT: adcq %rdx, %rax
@@ -5638,8 +5637,8 @@
 ; X64-NEXT: mulq %rdi
 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 72(%rsi), %rbx
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq 72(%rsi), %rax
+; X64-NEXT: movq %rax, %rbx
 ; X64-NEXT: mulq %rdi
 ; X64-NEXT: movq %rdx, %rsi
 ; X64-NEXT: movq %rax, %rbp
@@ -5812,9 +5811,9 @@
 ; X64-NEXT: addq %r13, %r9
 ; X64-NEXT: adcq %rdi, %r8
 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %rsi
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: movq %rax, %rbx
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
 ; X64-NEXT: mulq %rcx
 ; X64-NEXT: movq %rax, %r10
Index: test/CodeGen/X86/mul-i512.ll
===================================================================
--- test/CodeGen/X86/mul-i512.ll
+++ test/CodeGen/X86/mul-i512.ll
@@ -956,15 +956,15 @@
 ; X64-NEXT: mulq %rcx
 ; X64-NEXT: movq %rdx, %r9
 ; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: addq %r10, %r15
 ; X64-NEXT: adcq %r13, %r9
 ; X64-NEXT: addq %rbp, %r15
 ; X64-NEXT: adcq %rsi, %r9
 ; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq (%rdi), %r14
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movq %rax, %r14
 ; X64-NEXT: mulq %rbx
 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, %r11
Index: test/CodeGen/X86/ragreedy-bug.ll
===================================================================
--- test/CodeGen/X86/ragreedy-bug.ll
+++ test/CodeGen/X86/ragreedy-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy -regalloc-csr-first-time-cost=0 | FileCheck %s
 ; This testing case is reduced from 197.parser prune_match function.
 ; We make sure register copies are not generated on isupper.exit blocks.
Index: test/CodeGen/X86/ragreedy-hoist-spill.ll
===================================================================
--- test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy -regalloc-csr-first-time-cost=0 | FileCheck %s
 ; This testing case is reduced from 254.gap SyFgets function.
 ; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
Index: test/CodeGen/X86/sjlj-eh.ll
===================================================================
--- test/CodeGen/X86/sjlj-eh.ll
+++ test/CodeGen/X86/sjlj-eh.ll
@@ -56,10 +56,10 @@
 ; CHECK: calll __Z20function_that_throwsv
 ;     _Unwind_SjLj_Unregister(&UFC);
 ; CHECK: leal -64(%ebp), %eax
+; CHECK: pushl %eax
 ; CHECK: calll __Unwind_SjLj_Unregister
 ;
 ; CHECK: [[RESUME]]:
-; CHECK: leal -64(%ebp), %esi
 ;     assert(UFC.__callsite <= 1);
 ; CHECK: movl -60(%ebp), %eax
 ; CHECK: cmpl $1, %eax
Index: test/CodeGen/X86/x86-shrink-wrapping.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrapping.ll
+++ test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -941,7 +941,6 @@
 ; Make sure the epilogue happens in the exit block.
 ; CHECK-NOT: popq
 ; CHECK: popq
-; CHECK-NEXT: popq
 ; CHECK-NEXT: retq
 define i32 @irreducibleCFG() #4 {
 entry:
Index: test/DebugInfo/X86/live-debug-values.ll
===================================================================
--- test/DebugInfo/X86/live-debug-values.ll
+++ test/DebugInfo/X86/live-debug-values.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=asm %s -o - | FileCheck %s
+; RUN: llc -filetype=asm -regalloc-csr-first-time-cost=0 %s -o - | FileCheck %s
 ; Test the extension of debug ranges from predecessors.
 ; Generated from the source file LiveDebugValues.c:
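
For context on the allocator change above, here is a minimal self-contained C++ sketch of the two rules the RegAllocGreedy.cpp hunks introduce: the -1 sentinel that lets -regalloc-csr-first-time-cost override the target's getCSRFirstUseCost() default, and the CSRCost penalty applied when hint recoloring would move a value onto a callee-saved register (CSR) for the first time. This is an editorial illustration, not part of the patch; names such as Frequency, resolveCSRCost, and isRecoloringProfitable are stand-ins, not LLVM API.

#include <cstdint>
#include <iostream>

// Illustrative stand-in for the raw value of llvm::BlockFrequency.
using Frequency = std::uint64_t;

// Mirrors the patched initializeCSRCost(): a non-negative value given on the
// command line overrides the target default; -1 means "unset, defer to the
// target's getCSRFirstUseCost()".
Frequency resolveCSRCost(int CmdLineCost, Frequency TargetDefault) {
  return CmdLineCost != -1 ? static_cast<Frequency>(CmdLineCost)
                           : TargetDefault;
}

// Mirrors the patched profitability check: recoloring onto a CSR for the
// first time must beat the current assignment by more than CSRCost, because
// the save/restore forced by the first CSR use can defeat shrink-wrapping.
bool isRecoloringProfitable(Frequency OldCopiesCost, Frequency NewCopiesCost,
                            bool CurrIsCSR, bool NewIsCSR, Frequency CSRCost) {
  const Frequency Penalty = (!CurrIsCSR && NewIsCSR) ? CSRCost : 0;
  // The patch rejects the recoloring when
  //   OldCopiesCost < NewCopiesCost + Penalty.
  return OldCopiesCost >= NewCopiesCost + Penalty;
}

int main() {
  const Frequency CSRCost = resolveCSRCost(/*CmdLineCost=*/-1,
                                           /*TargetDefault=*/1 << 13);
  // A marginal 100-unit win no longer justifies switching onto a CSR...
  std::cout << std::boolalpha
            << isRecoloringProfitable(1100, 1000, false, true, CSRCost)
            << '\n'; // false
  // ...but the same win still pays off when no CSR is newly touched.
  std::cout << isRecoloringProfitable(1100, 1000, false, false, CSRCost)
            << '\n'; // true
}

Since the targets above now report 1 << 13 while the entry block frequency is 1 << 14 (per the comment removed from AArch64RegisterInfo.h), a first-time switch onto a CSR must save more than half an entry-block execution's worth of copy frequency before it is accepted.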