Index: lib/Target/ARM/ARMFrameLowering.cpp =================================================================== --- lib/Target/ARM/ARMFrameLowering.cpp +++ lib/Target/ARM/ARMFrameLowering.cpp @@ -1920,9 +1920,23 @@ << "\n"); } + // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to + // restore LR in that case. + bool ExpensiveLRRestore = false; + if (!LRSpilled && AFI->isThumb1OnlyFunction()) { + for (MachineBasicBlock &MBB : MF) { + if (!MBB.empty() && + (MBB.back().getOpcode() == ARM::TCRETURNdi || + MBB.back().getOpcode() == ARM::TCRETURNri)) { + ExpensiveLRRestore = true; + break; + } + } + } + // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. // Spill LR as well so we can fold BX_RET to the registers restore (LDM). - if (!LRSpilled && CS1Spilled) { + if (!LRSpilled && CS1Spilled && !ExpensiveLRRestore) { SavedRegs.set(ARM::LR); NumGPRSpills++; SmallVectorImpl<unsigned>::iterator LRPos; @@ -1948,7 +1962,8 @@ // Windows on ARM, accept R11 (frame pointer) if (!AFI->isThumbFunction() || (STI.isTargetWindows() && Reg == ARM::R11) || - isARMLowRegister(Reg) || Reg == ARM::LR) { + isARMLowRegister(Reg) || + (Reg == ARM::LR && !ExpensiveLRRestore)) { SavedRegs.set(Reg); LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI) << " to make up alignment\n"); Index: test/CodeGen/ARM/v8m-tail-call.ll =================================================================== --- test/CodeGen/ARM/v8m-tail-call.ll +++ test/CodeGen/ARM/v8m-tail-call.ll @@ -1,34 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc %s -o - -mtriple=thumbv8m.base | FileCheck %s declare i32 @g(...) 
declare i32 @h0(i32, i32, i32, i32) define hidden i32 @f0() { +; CHECK-LABEL: f0: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl g +; CHECK-NEXT: movs r1, #1 +; CHECK-NEXT: movs r2, #2 +; CHECK-NEXT: movs r3, #3 +; CHECK-NEXT: ldr r7, [sp, #4] +; CHECK-NEXT: mov lr, r7 +; CHECK-NEXT: pop {r7} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: b h0 %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %2 = tail call i32 @h0(i32 %1, i32 1, i32 2, i32 3) ret i32 %2 -; CHECK-LABEL: f0 -; CHECK: ldr [[POP:r[4567]]], [sp, #4] -; CHECK-NEXT: mov lr, [[POP]] -; CHECK-NEXT: pop {{.*}}[[POP]] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: b h0 } declare i32 @h1(i32) define hidden i32 @f1() { +; CHECK-LABEL: f1: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl g +; CHECK-NEXT: pop {r7} +; CHECK-NEXT: pop {r1} +; CHECK-NEXT: mov lr, r1 +; CHECK-NEXT: b h1 %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %2 = tail call i32 @h1(i32 %1) ret i32 %2 -; CHECK-LABEL: f1 -; CHECK: pop {r7} -; CHECK: pop {r1} -; CHECK: mov lr, r1 -; CHECK: b h1 } declare i32 @h2(i32, i32, i32, i32, i32) define hidden i32 @f2(i32, i32, i32, i32, i32) { +; CHECK-LABEL: f2: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: bl g +; CHECK-NEXT: cbz r0, .LBB2_2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: ldr r4, [sp, #12] +; CHECK-NEXT: mov lr, r4 +; CHECK-NEXT: pop {r4, r5, r6} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: b h2 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: mvns r0, r0 +; CHECK-NEXT: pop {r4, r5, r6, pc} %6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)() %7 = icmp eq i32 %6, 0 br i1 %7, label %10, label %8 @@ -38,12 +69,6 @@ %11 = phi i32 [ %9, %8 ], [ -1, %5 ] ret i32 %11 -; CHECK-LABEL: f2 -; CHECK: ldr [[POP:r[4567]]], [sp, #12] -; CHECK-NEXT: mov lr, [[POP]] -; 
CHECK-NEXT: pop {{.*}}[[POP]] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: b h2 } ; Make sure that tail calls to function pointers that require r0-r3 for argument @@ -51,7 +76,17 @@ @fnptr = global i32 (i32, i32, i32, i32)* null define i32 @test3() { ; CHECK-LABEL: test3: -; CHECK: blx {{r[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: movw r0, :lower16:fnptr +; CHECK-NEXT: movt r0, :upper16:fnptr +; CHECK-NEXT: ldr r4, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: movs r3, #4 +; CHECK-NEXT: blx r4 +; CHECK-NEXT: pop {r4, pc} %1 = load i32 (i32, i32, i32, i32)*, i32 (i32, i32, i32, i32)** @fnptr %2 = tail call i32 %1(i32 1, i32 2, i32 3, i32 4) ret i32 %2 @@ -60,7 +95,17 @@ @fnptr2 = global i32 (i32, i32, i64)* null define i32 @test4() { ; CHECK-LABEL: test4: -; CHECK: blx {{r[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: movw r0, :lower16:fnptr2 +; CHECK-NEXT: movt r0, :upper16:fnptr2 +; CHECK-NEXT: ldr r4, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: blx r4 +; CHECK-NEXT: pop {r4, pc} %1 = load i32 (i32, i32, i64)*, i32 (i32, i32, i64)** @fnptr2 %2 = tail call i32 %1(i32 1, i32 2, i64 3) ret i32 %2 @@ -72,9 +117,13 @@ @fnptr3 = global i32 (i32, i32)* null define i32 @test5() { ; CHECK-LABEL: test5: -; CHECK: ldr [[REG:r[0-9]+]] -; CHECK: bx [[REG]] -; CHECK-NOT: blx [[REG]] +; CHECK: @ %bb.0: +; CHECK-NEXT: movw r0, :lower16:fnptr3 +; CHECK-NEXT: movt r0, :upper16:fnptr3 +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: bx r2 %1 = load i32 (i32, i32)*, i32 (i32, i32)** @fnptr3 %2 = tail call i32 %1(i32 1, i32 2) ret i32 %2 @@ -84,9 +133,14 @@ @fnptr4 = global i32 (i32, i64)* null define i32 @test6() { ; CHECK-LABEL: test6: -; CHECK: ldr [[REG:r[0-9]+]] -; CHECK: bx [[REG]] -; CHECK-NOT: blx [[REG]] +; CHECK: @ %bb.0: +; CHECK-NEXT: movw 
r0, :lower16:fnptr4 +; CHECK-NEXT: movt r0, :upper16:fnptr4 +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r2, #2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: bx r1 %1 = load i32 (i32, i64)*, i32 (i32, i64)** @fnptr4 %2 = tail call i32 %1(i32 1, i64 2) ret i32 %2 @@ -96,8 +150,12 @@ ; tail-call optimized. define i32 @test7() { ; CHECK-LABEL: test7: -; CHECK: b bar -; CHECK-NOT: bl bar +; CHECK: @ %bb.0: +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: movs r3, #4 +; CHECK-NEXT: b bar %tail = tail call i32 @bar(i32 1, i32 2, i32 3, i32 4) ret i32 %tail } @@ -109,6 +167,33 @@ %struct.S = type { i32 } define void @test8(i32 (i32, i32, i32)* nocapture %fn, i32 %x) local_unnamed_addr { +; CHECK-LABEL: test8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: bl test8_u +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: ldr r6, [r0] +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl test8_h +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: bl test8_g +; CHECK-NEXT: str r4, [r5] +; CHECK-NEXT: movs r0, #1 +; CHECK-NEXT: movs r1, #2 +; CHECK-NEXT: movs r2, #3 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: ldr r4, [sp, #16] +; CHECK-NEXT: mov lr, r4 +; CHECK-NEXT: pop {r4, r5, r6, r7} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx r3 entry: %call = tail call %struct.S* bitcast (%struct.S* (...)* @test8_u to %struct.S* ()*)() %a = getelementptr inbounds %struct.S, %struct.S* %call, i32 0, i32 0 @@ -125,6 +210,28 @@ declare i32 @test8_g(i32, i32, i32) declare i32 @test8_h(i32) -; CHECK: str r0, [sp] @ 4-byte Spill -; CHECK: ldr r3, [sp] @ 4-byte Reload -; CHECK: bx r3 + +; Check that we don't introduce an unnecessary spill of lr. 
+declare i32 @h9(i32, i32, i32, i32) +define i32 @test9(i32* %x, i32* %y, i32* %z, i32* %a) { +; CHECK-LABEL: test9: +; CHECK: @ %bb.0: +; CHECK-NEXT: push {r4, r7} +; CHECK-NEXT: ldr r4, [r3] +; CHECK-NEXT: ldr r3, [r3, #4] +; CHECK-NEXT: adds r3, r4, r3 +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: pop {r4, r7} +; CHECK-NEXT: b h9 + %zz = load i32, i32* %z + %xx = load i32, i32* %x + %yy = load i32, i32* %y + %aa1 = load i32, i32* %a + %a2 = getelementptr i32, i32* %a, i32 1 + %aa2 = load i32, i32* %a2 + %aa = add i32 %aa1, %aa2 + %r = tail call i32 @h9(i32 %xx, i32 %yy, i32 %zz, i32 %aa) + ret i32 %r +}