Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -2306,13 +2306,30 @@
   assert(Subtarget->supportsTailCall());
 
-  // Tail calls to function pointers cannot be optimized for Thumb1 if the args
-  // to the call take up r0-r3. The reason is that there are no legal registers
-  // left to hold the pointer to the function to be called.
-  if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
-      !isa<GlobalAddressSDNode>(Callee.getNode()))
+  if (Subtarget->isThumb1Only()) {
+    // FIXME: This approximation isn't right for non-ELF targets.
+    if (!Subtarget->isTargetELF())
+      return false;
+    bool IsImmediateCall = isa<GlobalAddressSDNode>(Callee.getNode()) ||
+                           isa<ExternalSymbolSDNode>(Callee.getNode());
+
+    // Tail calls to function pointers cannot be optimized for Thumb1 if the
+    // args to the call take up r0-r3. The reason is that there are no legal
+    // registers left to hold the pointer to the function to be called. r12 is
+    // free, but it would be tricky to emit the right sequence because LLVM
+    // doesn't treat it as allocatable.
+    if (Outs.size() >= 4 && (!Subtarget->hasV8MBaselineOps() ||
+                             !IsImmediateCall))
       return false;
+
+    // Don't try to emit a tail call on Thumb1 if the callee is a known global;
+    // we would be forced to load the address to a GPR. v8m is the exception: it
+    // supports the required immediate branch. (This restriction shouldn't be
+    // necessary for correctness; it's just a codesize optimization.)
+    if (!Subtarget->hasV8MBaselineOps() && IsImmediateCall)
+      return false;
+  }
+
   // Look for obvious safe cases to perform tail call optimization that do not
   // require ABI changes. This is what gcc calls sibcall.
 
@@ -2373,6 +2390,12 @@
     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
     if (CCInfo.getNextStackOffset()) {
+      // On Thumb1, don't tail call functions which pass data on the stack;
+      // we need to be able to reverse the transform after isel to avoid
+      // increasing codesize. (See
+      // Thumb1FrameLowering::restoreCalleeSavedRegisters.)
+      if (Subtarget->isThumb1Only())
+        return false;
       // Check if the arguments are already laid out in the right way as
       // the caller's fixed stack objects.
       MachineFrameInfo &MFI = MF.getFrameInfo();
Index: lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- lib/Target/ARM/ARMSubtarget.cpp
+++ lib/Target/ARM/ARMSubtarget.cpp
@@ -206,28 +206,10 @@
   if (isTargetNaCl() || isAAPCS16_ABI())
     stackAlignment = 16;
 
-  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
-  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
-  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
-  // support in the assembler and linker to be used. This would need to be
-  // fixed to fully support tail calls in Thumb1.
-  //
-  // For ARMv8-M, we /do/ implement tail calls. Doing this is tricky for v8-M
-  // baseline, since the LDM/POP instruction on Thumb doesn't take LR. This
-  // means if we need to reload LR, it takes extra instructions, which outweighs
-  // the value of the tail call; but here we don't know yet whether LR is going
-  // to be used. We take the optimistic approach of generating the tail call and
-  // perhaps taking a hit if we need to restore the LR.
-
-  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
-  // but we need to make sure there are enough registers; the only valid
-  // registers are the 4 used for parameters. We don't currently do this
-  // case.
-
-  SupportsTailCall = !isThumb() || hasV8MBaselineOps();
-
   if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0))
     SupportsTailCall = false;
+  else
+    SupportsTailCall = true;
 
   switch (IT) {
   case DefaultIT:
Index: lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- lib/Target/ARM/Thumb1FrameLowering.cpp
+++ lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -962,15 +962,10 @@
     if (Reg == ARM::LR) {
       Info.setRestored(false);
-      if (!MBB.succ_empty() ||
-          MI->getOpcode() == ARM::TCRETURNdi ||
-          MI->getOpcode() == ARM::TCRETURNri)
+      if (!MBB.succ_empty())
         // LR may only be popped into PC, as part of return sequence.
         // If this isn't the return sequence, we'll need emitPopSpecialFixUp
         // to restore LR the hard way.
-        // FIXME: if we don't pass any stack arguments it would be actually
-        // advantageous *and* correct to do the conversion to an ordinary call
-        // instruction here.
         continue;
       // Special epilogue for vararg functions. See emitEpilogue
       if (isVarArg)
@@ -978,6 +973,20 @@
       // ARMv4T requires BX, see emitEpilogue
       if (!STI.hasV5TOps())
         continue;
+      // Tailcall optimization failed; change TCRETURN to a tBL. We could
+      // fall back to emitPopSpecialFixUp instead, but that's more code.
+      // Assumes tail-calls don't pass any arguments on the stack;
+      // this is enforced in
+      // ARMTargetLowering::IsEligibleForTailCallOptimization.
+      if (MI->getOpcode() == ARM::TCRETURNdi ||
+          MI->getOpcode() == ARM::TCRETURNri) {
+        unsigned Opcode = MI->getOpcode() == ARM::TCRETURNdi
+                              ? ARM::tBL : ARM::tBLXr;
+        MachineInstrBuilder BL = BuildMI(MF, DL, TII.get(Opcode));
+        BL.add(predOps(ARMCC::AL));
+        BL.add(MI->getOperand(0));
+        MBB.insert(MI, &*BL);
+      }
       // Pop LR into PC.
       Reg = ARM::PC;
Index: test/CodeGen/ARM/thumb_indirect_calls.ll
===================================================================
--- test/CodeGen/ARM/thumb_indirect_calls.ll
+++ test/CodeGen/ARM/thumb_indirect_calls.ll
@@ -4,7 +4,7 @@
 @f = common global void (i32)* null, align 4
 
 ; CHECK-LABEL: foo:
-define void @foo(i32 %x) {
+define void @foo(i32 %x) "disable-tail-calls"="true" {
 entry:
   %0 = load void (i32)*, void (i32)** @f, align 4
   tail call void %0(i32 %x)
@@ -22,7 +22,7 @@
 }
 
 ; CHECK-LABEL: bar:
-define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) {
+define void @bar(void (i32)* nocapture %g, i32 %x, void (i32)* nocapture %h) "disable-tail-calls"="true" {
 entry:
   tail call void %g(i32 %x)
   tail call void %h(i32 %x)
Index: test/CodeGen/ARM/v8m-tail-call.ll
===================================================================
--- test/CodeGen/ARM/v8m-tail-call.ll
+++ test/CodeGen/ARM/v8m-tail-call.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc %s -o - -mtriple=thumbv8m.base | FileCheck %s
+; RUN: llc %s -o - -mtriple=thumbv6m-eabi | FileCheck %s -check-prefix=V6M
 
 declare i32 @g(...)
@@ -12,11 +13,19 @@
 ; CHECK-NEXT:    movs r1, #1
 ; CHECK-NEXT:    movs r2, #2
 ; CHECK-NEXT:    movs r3, #3
-; CHECK-NEXT:    ldr r7, [sp, #4]
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    pop {r7}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    b h0
+; CHECK-NEXT:    bl h0
+; CHECK-NEXT:    pop {r7, pc}
+;
+; V6M-LABEL: f0:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r7, lr}
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    bl g
+; V6M-NEXT:    movs r1, #1
+; V6M-NEXT:    movs r2, #2
+; V6M-NEXT:    movs r3, #3
+; V6M-NEXT:    bl h0
+; V6M-NEXT:    pop {r7, pc}
 %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
 %2 = tail call i32 @h0(i32 %1, i32 1, i32 2, i32 3)
 ret i32 %2
@@ -28,10 +37,16 @@
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    bl g
-; CHECK-NEXT:    pop {r7}
-; CHECK-NEXT:    pop {r1}
-; CHECK-NEXT:    mov lr, r1
-; CHECK-NEXT:    b h1
+; CHECK-NEXT:    bl h1
+; CHECK-NEXT:    pop {r7, pc}
+;
+; V6M-LABEL: f1:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r7, lr}
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    bl g
+; V6M-NEXT:    bl h1
+; V6M-NEXT:    pop {r7, pc}
 %1 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
 %2 = tail call i32 @h1(i32 %1)
 ret i32 %2
@@ -42,24 +57,55 @@
 ; CHECK-LABEL: f2:
 ; CHECK:       @ %bb.0:
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    mov r4, r3
 ; CHECK-NEXT:    mov r5, r2
 ; CHECK-NEXT:    mov r6, r1
 ; CHECK-NEXT:    bl g
 ; CHECK-NEXT:    cbz r0, .LBB2_2
 ; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:    ldr r1, [sp, #24]
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    str r1, [r2]
 ; CHECK-NEXT:    mov r1, r6
 ; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:    ldr r4, [sp, #12]
-; CHECK-NEXT:    mov lr, r4
-; CHECK-NEXT:    pop {r4, r5, r6}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    b h2
+; CHECK-NEXT:    bl h2
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    mvns r0, r0
+; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
+;
+; V6M-LABEL: f2:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, r5, r6, lr}
+; V6M-NEXT:    push {r4, r5, r6, lr}
+; V6M-NEXT:    .pad #8
+; V6M-NEXT:    sub sp, #8
+; V6M-NEXT:    mov r4, r3
+; V6M-NEXT:    mov r5, r2
+; V6M-NEXT:    mov r6, r1
+; V6M-NEXT:    bl g
+; V6M-NEXT:    cmp r0, #0
+; V6M-NEXT:    beq .LBB2_2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:    ldr r1, [sp, #24]
+; V6M-NEXT:    mov r2, sp
+; V6M-NEXT:    str r1, [r2]
+; V6M-NEXT:    mov r1, r6
+; V6M-NEXT:    mov r2, r5
+; V6M-NEXT:    mov r3, r4
+; V6M-NEXT:    bl h2
+; V6M-NEXT:    add sp, #8
+; V6M-NEXT:    pop {r4, r5, r6, pc}
+; V6M-NEXT:  .LBB2_2:
+; V6M-NEXT:    movs r0, #0
+; V6M-NEXT:    mvns r0, r0
+; V6M-NEXT:    add sp, #8
+; V6M-NEXT:    pop {r4, r5, r6, pc}
 %6 = tail call i32 bitcast (i32 (...)* @g to i32 ()*)()
 %7 = icmp eq i32 %6, 0
 br i1 %7, label %10, label %8
@@ -87,6 +133,23 @@
 ; CHECK-NEXT:    movs r3, #4
 ; CHECK-NEXT:    blx r4
 ; CHECK-NEXT:    pop {r4, pc}
+;
+; V6M-LABEL: test3:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, lr}
+; V6M-NEXT:    push {r4, lr}
+; V6M-NEXT:    ldr r0, .LCPI3_0
+; V6M-NEXT:    ldr r4, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    movs r3, #4
+; V6M-NEXT:    blx r4
+; V6M-NEXT:    pop {r4, pc}
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI3_0:
+; V6M-NEXT:    .long fnptr
 %1 = load i32 (i32, i32, i32, i32)*, i32 (i32, i32, i32, i32)** @fnptr
 %2 = tail call i32 %1(i32 1, i32 2, i32 3, i32 4)
 ret i32 %2
@@ -106,6 +169,23 @@
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    blx r4
 ; CHECK-NEXT:    pop {r4, pc}
+;
+; V6M-LABEL: test4:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, lr}
+; V6M-NEXT:    push {r4, lr}
+; V6M-NEXT:    ldr r0, .LCPI4_0
+; V6M-NEXT:    ldr r4, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    movs r3, #0
+; V6M-NEXT:    blx r4
+; V6M-NEXT:    pop {r4, pc}
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI4_0:
+; V6M-NEXT:    .long fnptr2
 %1 = load i32 (i32, i32, i64)*, i32 (i32, i32, i64)** @fnptr2
 %2 = tail call i32 %1(i32 1, i32 2, i64 3)
 ret i32 %2
@@ -124,6 +204,18 @@
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    movs r1, #2
 ; CHECK-NEXT:    bx r2
+;
+; V6M-LABEL: test5:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    ldr r0, .LCPI5_0
+; V6M-NEXT:    ldr r2, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    bx r2
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI5_0:
+; V6M-NEXT:    .long fnptr3
 %1 = load i32 (i32, i32)*, i32 (i32, i32)** @fnptr3
 %2 = tail call i32 %1(i32 1, i32 2)
 ret i32 %2
@@ -141,6 +233,19 @@
 ; CHECK-NEXT:    movs r2, #2
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    bx r1
+;
+; V6M-LABEL: test6:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    ldr r0, .LCPI6_0
+; V6M-NEXT:    ldr r1, [r0]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r2, #2
+; V6M-NEXT:    movs r3, #0
+; V6M-NEXT:    bx r1
+; V6M-NEXT:    .p2align 2
+; V6M-NEXT:  @ %bb.1:
+; V6M-NEXT:  .LCPI6_0:
+; V6M-NEXT:    .long fnptr4
 %1 = load i32 (i32, i64)*, i32 (i32, i64)** @fnptr4
 %2 = tail call i32 %1(i32 1, i64 2)
 ret i32 %2
@@ -156,6 +261,17 @@
 ; CHECK-NEXT:    movs r2, #3
 ; CHECK-NEXT:    movs r3, #4
 ; CHECK-NEXT:    b bar
+;
+; V6M-LABEL: test7:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r7, lr}
+; V6M-NEXT:    push {r7, lr}
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    movs r3, #4
+; V6M-NEXT:    bl bar
+; V6M-NEXT:    pop {r7, pc}
 %tail = tail call i32 @bar(i32 1, i32 2, i32 3, i32 4)
 ret i32 %tail
 }
@@ -188,12 +304,36 @@
 ; CHECK-NEXT:    movs r1, #2
 ; CHECK-NEXT:    movs r2, #3
 ; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; CHECK-NEXT:    blx r3
 ; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    ldr r4, [sp, #16]
-; CHECK-NEXT:    mov lr, r4
-; CHECK-NEXT:    pop {r4, r5, r6, r7}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    bx r3
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+;
+; V6M-LABEL: test8:
+; V6M:       @ %bb.0: @ %entry
+; V6M-NEXT:    .save {r4, r5, r6, r7, lr}
+; V6M-NEXT:    push {r4, r5, r6, r7, lr}
+; V6M-NEXT:    .pad #4
+; V6M-NEXT:    sub sp, #4
+; V6M-NEXT:    mov r4, r1
+; V6M-NEXT:    str r0, [sp] @ 4-byte Spill
+; V6M-NEXT:    bl test8_u
+; V6M-NEXT:    mov r5, r0
+; V6M-NEXT:    ldr r6, [r0]
+; V6M-NEXT:    movs r7, #0
+; V6M-NEXT:    mov r0, r7
+; V6M-NEXT:    bl test8_h
+; V6M-NEXT:    mov r1, r0
+; V6M-NEXT:    mov r0, r6
+; V6M-NEXT:    mov r2, r7
+; V6M-NEXT:    bl test8_g
+; V6M-NEXT:    str r4, [r5]
+; V6M-NEXT:    movs r0, #1
+; V6M-NEXT:    movs r1, #2
+; V6M-NEXT:    movs r2, #3
+; V6M-NEXT:    ldr r3, [sp] @ 4-byte Reload
+; V6M-NEXT:    blx r3
+; V6M-NEXT:    add sp, #4
+; V6M-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
 %call = tail call %struct.S* bitcast (%struct.S* (...)* @test8_u to %struct.S* ()*)()
 %a = getelementptr inbounds %struct.S, %struct.S* %call, i32 0, i32 0
@@ -225,6 +365,19 @@
 ; CHECK-NEXT:    ldr r2, [r2]
 ; CHECK-NEXT:    pop {r4, r7}
 ; CHECK-NEXT:    b h9
+;
+; V6M-LABEL: test9:
+; V6M:       @ %bb.0:
+; V6M-NEXT:    .save {r4, lr}
+; V6M-NEXT:    push {r4, lr}
+; V6M-NEXT:    ldr r4, [r3]
+; V6M-NEXT:    ldr r3, [r3, #4]
+; V6M-NEXT:    adds r3, r4, r3
+; V6M-NEXT:    ldr r1, [r1]
+; V6M-NEXT:    ldr r0, [r0]
+; V6M-NEXT:    ldr r2, [r2]
+; V6M-NEXT:    bl h9
+; V6M-NEXT:    pop {r4, pc}
 %zz = load i32, i32* %z
 %xx = load i32, i32* %x
 %yy = load i32, i32* %y