Index: lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- lib/CodeGen/RegAllocGreedy.cpp
+++ lib/CodeGen/RegAllocGreedy.cpp
@@ -97,10 +97,10 @@
                                        cl::init(false));
 
 // FIXME: Find a good default for this flag and remove the flag.
-static cl::opt<unsigned>
-CSRFirstTimeCost("regalloc-csr-first-time-cost",
-              cl::desc("Cost for first time use of callee-saved register."),
-              cl::init(0), cl::Hidden);
+static cl::opt<int> CSRFirstTimeCost(
+    "regalloc-csr-first-time-cost",
+    cl::desc("Cost for first time use of callee-saved register."), cl::init(-1),
+    cl::Hidden);
 
 static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
                                        createGreedyRegisterAllocator);
@@ -2351,10 +2351,11 @@
 }
 
 void RAGreedy::initializeCSRCost() {
-  // We use the larger one out of the command-line option and the value report
-  // by TRI.
-  CSRCost = BlockFrequency(
-      std::max((unsigned)CSRFirstTimeCost, TRI->getCSRFirstUseCost()));
+  // The cost from the command-line option overrides the value reported by TRI.
+  if (CSRFirstTimeCost != -1)
+    CSRCost = BlockFrequency(CSRFirstTimeCost);
+  else
+    CSRCost = BlockFrequency(TRI->getCSRFirstUseCost());
   if (!CSRCost.getFrequency())
     return;
 
@@ -2468,9 +2469,16 @@
     DEBUG(dbgs() << "Checking profitability:\n");
     BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
     BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+    // If we switch from a non-CSR register to a CSR register, we require that
+    // the cost difference be larger than CSRCost to justify the recoloring,
+    // because such recoloring may hinder shrink-wrapping.
+    bool ChangedToCSR = !RegClassInfo.getLastCalleeSavedAlias(CurrPhys) &&
+                        RegClassInfo.getLastCalleeSavedAlias(PhysReg);
     DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
-                 << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
-    if (OldCopiesCost < NewCopiesCost) {
+                 << "\nNew Cost: " << NewCopiesCost.getFrequency()
+                 << "\nCSRCost: "
+                 << (ChangedToCSR ? CSRCost.getFrequency() : 0) << '\n');
+    if (OldCopiesCost < NewCopiesCost + (ChangedToCSR ? CSRCost : 0)) {
       DEBUG(dbgs() << "=> Not profitable.\n");
       continue;
     }
Index: lib/Target/AArch64/AArch64RegisterInfo.h
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.h
+++ lib/Target/AArch64/AArch64RegisterInfo.h
@@ -39,12 +39,7 @@
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
                                        CallingConv::ID) const override;
 
-  unsigned getCSRFirstUseCost() const override {
-    // The cost will be compared against BlockFrequency where entry has the
-    // value of 1 << 14. A value of 5 will choose to spill or split really
-    // cold path instead of using a callee-saved register.
-    return 5;
-  }
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
 
   // Calls involved in thread-local variable lookup save more registers than
   // normal calls, so they need a different mask to represent this.
Index: lib/Target/ARM/ARMBaseRegisterInfo.h
===================================================================
--- lib/Target/ARM/ARMBaseRegisterInfo.h
+++ lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -150,6 +150,8 @@
                              const VirtRegMap *VRM,
                              const LiveRegMatrix *Matrix) const override;
 
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
+
   void updateRegAllocHint(unsigned Reg, unsigned NewReg,
                           MachineFunction &MF) const override;
 
Index: lib/Target/PowerPC/PPCRegisterInfo.h
===================================================================
--- lib/Target/PowerPC/PPCRegisterInfo.h
+++ lib/Target/PowerPC/PPCRegisterInfo.h
@@ -84,6 +84,8 @@
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
+
   /// We require the register scavenger.
   bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
Index: lib/Target/X86/X86RegisterInfo.h
===================================================================
--- lib/Target/X86/X86RegisterInfo.h
+++ lib/Target/X86/X86RegisterInfo.h
@@ -105,6 +105,8 @@
                                        CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const override;
 
+  unsigned getCSRFirstUseCost() const override { return 1 << 13; }
+
   // Calls involved in thread-local variable lookup save more registers than
   // normal calls, so they need a different mask to represent this.
   const uint32_t *getDarwinTLSCallPreservedMask() const;
Index: test/CodeGen/ARM/divmod-eabi.ll
===================================================================
--- test/CodeGen/ARM/divmod-eabi.ll
+++ test/CodeGen/ARM/divmod-eabi.ll
@@ -225,6 +225,7 @@
 ; DARWIN: __modsi3
 ; DARWIN: mov [[sum:r[0-9]+]], r0
 ; WINDOWS: __rt_sdiv
+; WINDOWS: mov r0, r1
 ; WINDOWS: mov [[rem:r[0-9]+]], r1
   %rem1 = srem i32 %b, %rem
 ; EABI: __aeabi_idivmod
Index: test/CodeGen/X86/atom-fixup-lea2.ll
===================================================================
--- test/CodeGen/X86/atom-fixup-lea2.ll
+++ test/CodeGen/X86/atom-fixup-lea2.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
 ; CHECK:BB#5
-; CHECK-NEXT:leal
+; CHECK-NEXT:addl 12(%esp), %esi
 ; CHECK-NEXT:leal
 ; CHECK-NEXT:leal
 ; CHECK-NEXT:movl
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -1067,7 +1067,6 @@
 ; CHECK-LABEL: test_cold_calls:
 ; CHECK: %entry
 ; CHECK: %else
-; CHECK: %exit
 ; CHECK: %then
 
 entry:
Index: test/CodeGen/X86/csr-split1.ll
===================================================================
--- test/CodeGen/X86/csr-split1.ll
+++ test/CodeGen/X86/csr-split1.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; Check that CSR splitting works properly for the tests below.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@cond = common local_unnamed_addr global i64 0, align 8
+@ret = common local_unnamed_addr global i64 0, align 8
+@p = common local_unnamed_addr global i64* null, align 8
+
+declare void @foo(i64, i64, i64)
+
+; After CSR split, shrink-wrapping is enabled and the prologue is moved from
+; the entry block to BB#1.
+;
+; CHECK-LABEL: test1:
+; CHECK: .LBB0_1:
+; CHECK-NEXT: push
+; CHECK-NEXT: push
+; CHECK-NEXT: push
+; CHECK: callq foo
+; CHECK: pop
+; CHECK-NEXT: pop
+; CHECK-NEXT: pop
+; CHECK-NEXT: jmp .LBB0_2
+define void @test1(i64 %i, i64 %j, i64 %k) nounwind {
+entry:
+  %t0 = load i64, i64* @cond, align 8
+  %tobool = icmp eq i64 %t0, 0
+  br i1 %tobool, label %if.end, label %if.then, !prof !0
+
+if.then:                                          ; preds = %entry
+  tail call void @foo(i64 %i, i64 %j, i64 %k)
+  %add = add nsw i64 %j, %i
+  %add1 = add nsw i64 %add, %k
+  store i64 %add1, i64* @ret, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %t1 = load i64*, i64** @p, align 8
+  store volatile i64 3, i64* %t1, align 8
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 2000, i32 1}
+
+; After CSR split, even though shrink-wrapping is not enabled because stack
+; allocation is used, the parameter-passing moves can still be sunk from the
+; prologue to BB#1.
+;
+; CHECK-LABEL: test2:
+; CHECK-NEXT: # BB#0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: cmpq
+; CHECK-NEXT: jne .LBB1_1
+; CHECK: .LBB1_1:
+; CHECK-NEXT: movq %rdi,
+; CHECK-NEXT: movq %rsi,
+; CHECK-NEXT: movq %rdx,
+; CHECK-NEXT: callq foo
+;
+define void @test2(i64 %i, i64 %j, i64 %k) nounwind {
+entry:
+  %t0 = load i64, i64* @cond, align 8
+  %tobool = icmp eq i64 %t0, 0
+  br i1 %tobool, label %if.end, label %if.then, !prof !1
+
+if.then:                                          ; preds = %entry
+  tail call void @foo(i64 %i, i64 %j, i64 %k)
+  %add = add nsw i64 %j, %i
+  %add1 = add nsw i64 %add, %k
+  store i64 %add1, i64* @ret, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %t1 = alloca [3 x i8], align 16
+  store [3 x i8]* %t1, [3 x i8]** bitcast (i64** @p to [3 x i8]**), align 8
+  ret void
+}
+
+!1 = !{!"branch_weights", i32 2000, i32 1}
Index: test/CodeGen/X86/mul-i1024.ll
===================================================================
--- test/CodeGen/X86/mul-i1024.ll
+++ test/CodeGen/X86/mul-i1024.ll
@@ -4460,7 +4460,8 @@
 ; X64-NEXT: mulq %r9
 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %r14, %r9
+; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %rdi, %r9
 ; X64-NEXT: addq %rax, %r9
 ; X64-NEXT: movq %rcx, %rax
 ; X64-NEXT: adcq %rdx, %rax
@@ -4469,7 +4470,7 @@
 ; X64-NEXT: movq %rax, %rbp
 ; X64-NEXT: movq %r11, %rax
 ; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: addq %r14, %rax
+; X64-NEXT: addq %rdi, %rax
 ; X64-NEXT: adcq %rcx, %r15
 ; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq (%r10), %rax
@@ -4477,9 +4478,8 @@
 ; X64-NEXT: xorl %r15d, %r15d
 ; X64-NEXT: mulq %r15
 ; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rdi, %rax
 ; X64-NEXT: movq %rsi, %rax
 ; X64-NEXT: adcq %rcx, %rax
 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
@@ -4489,8 +4489,7 @@
 ; X64-NEXT: xorl %r8d, %r8d
 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: movq %rbx, %r14
+; X64-NEXT: movq %r14, %rcx
 ; X64-NEXT: addq %rax, %rcx
 ; X64-NEXT: movq %rsi, %rax
 ; X64-NEXT: adcq %rdx, %rax
@@ -5638,8 +5637,8 @@
 ; X64-NEXT: mulq %rdi
 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 72(%rsi), %rbx
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq 72(%rsi), %rax
+; X64-NEXT: movq %rax, %rbx
 ; X64-NEXT: mulq %rdi
 ; X64-NEXT: movq %rdx, %rsi
 ; X64-NEXT: movq %rax, %rbp
@@ -5812,9 +5811,9 @@
 ; X64-NEXT: addq %r13, %r9
 ; X64-NEXT: adcq %rdi, %r8
 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; X64-NEXT: imulq %rbx, %rsi
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: movq %rax, %rbx
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
 ; X64-NEXT: mulq %rcx
 ; X64-NEXT: movq %rax, %r10
Index: test/CodeGen/X86/mul-i512.ll
===================================================================
--- test/CodeGen/X86/mul-i512.ll
+++ test/CodeGen/X86/mul-i512.ll
@@ -956,15 +956,15 @@
 ; X64-NEXT: mulq %rcx
 ; X64-NEXT: movq %rdx, %r9
 ; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: addq %r10, %r15
 ; X64-NEXT: adcq %r13, %r9
 ; X64-NEXT: addq %rbp, %r15
 ; X64-NEXT: adcq %rsi, %r9
 ; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; X64-NEXT: movq (%rdi), %r14
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movq %rax, %r14
 ; X64-NEXT: mulq %rbx
 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
 ; X64-NEXT: movq %rdx, %r11
Index: test/CodeGen/X86/ragreedy-bug.ll
===================================================================
--- test/CodeGen/X86/ragreedy-bug.ll
+++ test/CodeGen/X86/ragreedy-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy -regalloc-csr-first-time-cost=0 | FileCheck %s
 ; This testing case is reduced from 197.parser prune_match function.
 ; We make sure register copies are not generated on isupper.exit blocks.
Index: test/CodeGen/X86/ragreedy-hoist-spill.ll
===================================================================
--- test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy -regalloc-csr-first-time-cost=0 | FileCheck %s
 ; This testing case is reduced from 254.gap SyFgets function.
 ; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
Index: test/CodeGen/X86/sjlj-eh.ll
===================================================================
--- test/CodeGen/X86/sjlj-eh.ll
+++ test/CodeGen/X86/sjlj-eh.ll
@@ -56,10 +56,10 @@
 ; CHECK: calll __Z20function_that_throwsv
 ;     _Unwind_SjLj_Unregister(&UFC);
 ; CHECK: leal -64(%ebp), %eax
+; CHECK: pushl %eax
 ; CHECK: calll __Unwind_SjLj_Unregister
 ;
 ; CHECK: [[RESUME]]:
-; CHECK: leal -64(%ebp), %esi
 ;     assert(UFC.__callsite <= 1);
 ; CHECK: movl -60(%ebp), %eax
 ; CHECK: cmpl $1, %eax
Index: test/CodeGen/X86/x86-shrink-wrapping.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrapping.ll
+++ test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -941,7 +941,6 @@
 ; Make sure the epilogue happens in the exit block.
 ; CHECK-NOT: popq
 ; CHECK: popq
-; CHECK-NEXT: popq
 ; CHECK-NEXT: retq
 define i32 @irreducibleCFG() #4 {
 entry:
Index: test/DebugInfo/X86/live-debug-values.ll
===================================================================
--- test/DebugInfo/X86/live-debug-values.ll
+++ test/DebugInfo/X86/live-debug-values.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=asm %s -o - | FileCheck %s
+; RUN: llc -filetype=asm -regalloc-csr-first-time-cost=0 %s -o - | FileCheck %s
 ; Test the extension of debug ranges from predecessors.
 ; Generated from the source file LiveDebugValues.c:
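
For context on the allocator change above, here is a minimal self-contained C++ sketch of the two rules the RegAllocGreedy.cpp hunks introduce: the -1 sentinel that lets -regalloc-csr-first-time-cost override the target's getCSRFirstUseCost() default, and the CSRCost penalty applied when hint recoloring would move a value onto a callee-saved register (CSR) for the first time. This is an editorial illustration, not part of the patch; names such as Frequency, resolveCSRCost, and isRecoloringProfitable are stand-ins, not LLVM API.

#include <cstdint>
#include <iostream>

// Illustrative stand-in for the raw value of llvm::BlockFrequency.
using Frequency = std::uint64_t;

// Mirrors the patched initializeCSRCost(): a non-negative value given on the
// command line overrides the target default; -1 means "unset, defer to the
// target's getCSRFirstUseCost()".
Frequency resolveCSRCost(int CmdLineCost, Frequency TargetDefault) {
  return CmdLineCost != -1 ? static_cast<Frequency>(CmdLineCost)
                           : TargetDefault;
}

// Mirrors the patched profitability check: recoloring onto a CSR for the
// first time must beat the current assignment by more than CSRCost, because
// the save/restore forced by the first CSR use can defeat shrink-wrapping.
bool isRecoloringProfitable(Frequency OldCopiesCost, Frequency NewCopiesCost,
                            bool CurrIsCSR, bool NewIsCSR, Frequency CSRCost) {
  const Frequency Penalty = (!CurrIsCSR && NewIsCSR) ? CSRCost : 0;
  // The patch rejects the recoloring when
  //   OldCopiesCost < NewCopiesCost + Penalty.
  return OldCopiesCost >= NewCopiesCost + Penalty;
}

int main() {
  const Frequency CSRCost = resolveCSRCost(/*CmdLineCost=*/-1,
                                           /*TargetDefault=*/1 << 13);
  // A marginal 100-unit win no longer justifies switching onto a CSR...
  std::cout << std::boolalpha
            << isRecoloringProfitable(1100, 1000, false, true, CSRCost)
            << '\n'; // false
  // ...but the same win still pays off when no CSR is newly touched.
  std::cout << isRecoloringProfitable(1100, 1000, false, false, CSRCost)
            << '\n'; // true
}

Since the targets above now report 1 << 13 while the entry block frequency is 1 << 14 (per the comment removed from AArch64RegisterInfo.h), a first-time switch onto a CSR must save more than half an entry-block execution's worth of copy frequency before it is accepted.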