diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -63,6 +63,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -70,6 +71,7 @@
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -81,6 +83,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "tailcallelim"
@@ -92,10 +95,10 @@
 /// Scan the specified function for alloca instructions.
 /// If it contains any dynamic allocas, returns false.
 static bool canTRE(Function &F) {
-  // FIXME: The code generator produces really bad code when an 'escaping
-  // alloca' is changed from being a static alloca to being a dynamic alloca.
-  // Until this is resolved, disable this transformation if that would ever
-  // happen.  This bug is PR962.
+  // TODO: We don't do TRE if dynamic allocas are used.
+  // Dynamic allocas allocate stack space which should be
+  // deallocated before a new iteration starts. That is
+  // currently not implemented.
   return llvm::all_of(instructions(F), [](Instruction &I) {
     auto *AI = dyn_cast<AllocaInst>(&I);
     return !AI || AI->isStaticAlloca();
@@ -188,11 +191,9 @@
 };
 }
 
-static bool markTails(Function &F, bool &AllCallsAreTailCalls,
-                      OptimizationRemarkEmitter *ORE) {
+static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
   if (F.callsFunctionThatReturnsTwice())
     return false;
-  AllCallsAreTailCalls = true;
 
   // The local stack holds all alloca instructions and all byval arguments.
   AllocaDerivedValueTracker Tracker;
@@ -275,11 +276,8 @@
         }
       }
 
-      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+      if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI))
         DeferredTails.push_back(CI);
-      } else {
-        AllCallsAreTailCalls = false;
-      }
     }
 
     for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) {
@@ -316,8 +314,6 @@
       LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
       CI->setTailCall();
       Modified = true;
-    } else {
-      AllCallsAreTailCalls = false;
     }
   }
 
@@ -329,6 +325,14 @@
 /// instructions between the call and this instruction are movable.
 ///
 static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+  if (isa<DbgInfoIntrinsic>(I))
+    return true;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
+        llvm::findAllocaForValue(II->getArgOperand(1)))
+      return true;
+
   // FIXME: We can move load/store/call/free instructions above the call if the
   // call does not mod/ref the memory location being processed.
   if (I->mayHaveSideEffects())  // This also handles volatile loads.
@@ -395,7 +399,6 @@
   // createTailRecurseLoopHeader the first time we find a call we can eliminate.
   BasicBlock *HeaderBB = nullptr;
   SmallVector<PHINode *, 8> ArgumentPHIs;
-  bool RemovableCallsMustBeMarkedTail = false;
 
   // PHI node to store our return value.
   PHINode *RetPN = nullptr;
@@ -422,8 +425,7 @@
                           DomTreeUpdater &DTU)
       : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
 
-  CallInst *findTRECandidate(BasicBlock *BB,
-                             bool CannotTailCallElimCallsMarkedTail);
+  CallInst *findTRECandidate(BasicBlock *BB);
 
   void createTailRecurseLoopHeader(CallInst *CI);
 
@@ -433,7 +435,9 @@
 
   void cleanupAndFinalize();
 
-  bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail);
+  bool processBlock(BasicBlock &BB);
+
+  Value *createTempForByValOperand(CallInst *CI, int OpndIdx);
 
 public:
   static bool eliminate(Function &F, const TargetTransformInfo *TTI,
@@ -442,8 +446,7 @@
 };
 } // namespace
 
-CallInst *TailRecursionEliminator::findTRECandidate(
-    BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) {
+CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
   Instruction *TI = BB->getTerminator();
 
   if (&BB->front() == TI) // Make sure there is something before the terminator.
@@ -463,9 +466,9 @@
     --BBI;
   }
 
-  // If this call is marked as a tail call, and if there are dynamic allocas in
-  // the function, we cannot perform this optimization.
-  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+  assert((!CI->isTailCall() || !CI->isNoTailCall()) &&
+         "Incompatible call site attributes(Tail,NoTail)");
+  if (!CI->isTailCall())
     return nullptr;
 
   // As a special case, detect code like this:
@@ -497,26 +500,13 @@
   BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
   BI->setDebugLoc(CI->getDebugLoc());
 
-  // If this function has self recursive calls in the tail position where some
-  // are marked tail and some are not, only transform one flavor or another.
-  // We have to choose whether we move allocas in the entry block to the new
-  // entry block or not, so we can't make a good choice for both. We make this
-  // decision here based on whether the first call we found to remove is
-  // marked tail.
-  // NOTE: We could do slightly better here in the case that the function has
-  // no entry block allocas.
-  RemovableCallsMustBeMarkedTail = CI->isTailCall();
-
-  // If this tail call is marked 'tail' and if there are any allocas in the
-  // entry block, move them up to the new entry block.
-  if (RemovableCallsMustBeMarkedTail)
-    // Move all fixed sized allocas from HeaderBB to NewEntry.
-    for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
-                              NEBI = NewEntry->begin();
-         OEBI != E;)
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
-        if (isa<ConstantInt>(AI->getArraySize()))
-          AI->moveBefore(&*NEBI);
+  // Move all fixed sized allocas from HeaderBB to NewEntry.
+  for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
+                            NEBI = NewEntry->begin();
+       OEBI != E;)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+      if (isa<ConstantInt>(AI->getArraySize()))
+        AI->moveBefore(&*NEBI);
 
   // Now that we have created a new block, which jumps to the entry
   // block, insert a PHI node for each argument of the function.
@@ -581,6 +571,37 @@
   ++NumAccumAdded;
 }
 
+Value *TailRecursionEliminator::createTempForByValOperand(CallInst *CI,
+                                                          int OpndIdx) {
+  PointerType *ArgTy = cast<PointerType>(CI->getArgOperand(OpndIdx)->getType());
+  Type *AggTy = ArgTy->getElementType();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // Calculate alignment of byVal operand.
+  Align Alignment(DL.getPrefTypeAlignment(AggTy));
+
+  // If the byval had an alignment specified, we *must* use at least that
+  // alignment, as it is required by the byval argument (and uses of the
+  // pointer inside the callee).
+  Alignment = max(Alignment, MaybeAlign(CI->getParamAlign(OpndIdx)));
+
+  // Create an alloca to hold a temporary copy of the byval operand.
+  // Put alloca into the entry block.
+  Value *NewAlloca = new AllocaInst(
+      AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+      CI->getArgOperand(OpndIdx)->getName(), &*F.getEntryBlock().begin());
+
+  IRBuilder<> Builder(CI);
+  Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy));
+
+  // Copy data from the byval operand into the temporary variable.
+  Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment,
+                       CI->getArgOperand(OpndIdx),
+                       /*SrcAlign*/ Alignment, Size);
+
+  return NewAlloca;
+}
+
 bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
   ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
 
@@ -619,14 +640,15 @@
   if (!HeaderBB)
     createTailRecurseLoopHeader(CI);
 
-  if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
-    return false;
-
   // Ok, now that we know we have a pseudo-entry block WITH all of the
   // required PHI nodes, add entries into the PHI node for the actual
   // parameters passed into the tail-recursive call.
-  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
-    ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+    if (CI->isByValArgument(i))
+      ArgumentPHIs[i]->addIncoming(createTempForByValOperand(CI, i), BB);
+    else
+      ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+  }
 
   if (AccRecInstr) {
     insertAccumulator(AccRecInstr);
@@ -743,8 +765,7 @@
   }
 }
 
-bool TailRecursionEliminator::processBlock(
-    BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) {
+bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
   Instruction *TI = BB.getTerminator();
 
   if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -757,7 +778,7 @@
     if (!Ret)
       return false;
 
-    CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+    CallInst *CI = findTRECandidate(&BB);
 
     if (!CI)
       return false;
@@ -778,7 +799,7 @@
     eliminateCall(CI);
     return true;
   } else if (isa<ReturnInst>(TI)) {
-    CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail);
+    CallInst *CI = findTRECandidate(&BB);
 
     if (CI)
       return eliminateCall(CI);
@@ -796,26 +817,21 @@
     return false;
 
   bool MadeChange = false;
-  bool AllCallsAreTailCalls = false;
-  MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
-  if (!AllCallsAreTailCalls)
-    return MadeChange;
+  MadeChange |= markTails(F, ORE);
 
   // If this function is a varargs function, we won't be able to PHI the args
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg())
     return MadeChange;
 
-  // If false, we cannot perform TRE on tail calls marked with the 'tail'
-  // attribute, because doing so would cause the stack size to increase (real
-  // TRE would deallocate variable sized allocas, TRE doesn't).
-  bool CanTRETailMarkedCall = canTRE(F);
+  if (!canTRE(F))
+    return MadeChange;
 
   // Change any tail recursive calls to loops.
   TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
 
   for (BasicBlock &BB : F)
-    MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall);
+    MadeChange |= TRE.processBlock(BB);
 
   TRE.cleanupAndFinalize();
 
diff --git a/llvm/test/Transforms/TailCallElim/basic.ll b/llvm/test/Transforms/TailCallElim/basic.ll
--- a/llvm/test/Transforms/TailCallElim/basic.ll
+++ b/llvm/test/Transforms/TailCallElim/basic.ll
@@ -12,15 +12,16 @@
 	ret void
 }
 
-; PR615. Make sure that we do not move the alloca so that it interferes with the tail call.
+; Make sure that we do not do TRE if a pointer to the local stack
+; escapes through a function call.
 define i32 @test1() {
 ; CHECK: i32 @test1()
 ; CHECK-NEXT: alloca
 	%A = alloca i32		; <i32*> [#uses=2]
 	store i32 5, i32* %A
 	call void @use(i32* %A)
-; CHECK: tail call i32 @test1
-	%X = tail call i32 @test1()		; <i32> [#uses=1]
+; CHECK: call i32 @test1
+	%X = call i32 @test1()		; <i32> [#uses=1]
 	ret i32 %X
 }
 
diff --git a/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll b/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; the test was generated from the following C++ source:
+;
+; #include <stdio.h>
+; typedef struct A { long long x[10] = {0}; } A;
+; A global;
+; void dostuff(A a, A b, int i) {
+;  if (i==10) return;
+;  a.x[5]++;
+;  printf("%lld %lld\n", a.x[5], b.x[5]); dostuff(b, a, i+1);
+; }
+; __attribute((optnone)) int main() { dostuff(global, global, 0); }
+;
+; This test checks that the values of byval operands are copied
+; into temporary variables before the function call (as per the
+; definition of byval operands). Additionally, the values from
+; these temporary variables (byval value holders) are copied into
+; other temporary variables, which are passed to the next iteration.
+; That is necessary since the original byval value holders have a reduced
+; lifetime scope and cannot be used later. Specifically:
+; the value of B_TR is copied into AGG_TMP, and A_TR is copied into
+; AGG_TMP5. AGG_TMP and AGG_TMP5 are marked with lifetime markers.
+; Later, the values from these byval holders are copied into the
+; temporary variables used on the next iteration of the loop:
+; AGG_TMP is copied into AGG_TMP1, and AGG_TMP5 is copied into
+; AGG_TMP52. They are then used on the next iteration.
+;
+; [[A_TR:%.*]] = phi %struct.A* [ [[A:%.*]], [[ENTRY:%.*]] ], 
+;                               [ [[AGG_TMP1]], [[IF_END:%.*]] ]
+; [[B_TR:%.*]] = phi %struct.A* [ [[B:%.*]], [[ENTRY]] ],
+;                               [ [[AGG_TMP52]], [[IF_END]] ]
+
+%struct.A = type { [10 x i64] }
+
+@global = dso_local local_unnamed_addr global %struct.A zeroinitializer, align 8
+@.str = private unnamed_addr constant [11 x i8] c"%lld %lld\0A\00", align 1
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @_Z7dostuff1AS_i(%struct.A* nocapture byval(%struct.A) align 8 %a, %struct.A* nocapture readonly byval(%struct.A) align 8 %b, i32 %i) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z7dostuff1AS_i(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP52:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    [[AGG_TMP5:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[A_TR:%.*]] = phi %struct.A* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[AGG_TMP1]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[B_TR:%.*]] = phi %struct.A* [ [[B:%.*]], [[ENTRY]] ], [ [[AGG_TMP52]], [[IF_END]] ]
+; CHECK-NEXT:    [[I_TR:%.*]] = phi i32 [ [[I:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[IF_END]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I_TR]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_A]], %struct.A* [[A_TR]], i64 0, i32 0, i64 5
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    store i64 [[INC]], i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_A]], %struct.A* [[B_TR]], i64 0, i32 0, i64 5
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX4]], align 8
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i64 [[INC]], i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.A* [[B_TR]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) [[TMP2]], i8* nonnull align 8 dereferenceable(80) [[TMP3]], i64 80, i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.A* [[AGG_TMP5]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.A* [[A_TR]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) [[TMP4]], i8* nonnull align 8 dereferenceable(80) [[TMP5]], i64 80, i1 false)
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[I_TR]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 80, i1 false)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.A* [[AGG_TMP52]] to i8*
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.A* [[AGG_TMP5]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 80, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull [[TMP2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %agg.tmp = alloca %struct.A, align 8
+  %agg.tmp5 = alloca %struct.A, align 8
+  %cmp = icmp eq i32 %i, 10
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %arrayidx = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0, i64 5
+  %0 = load i64, i64* %arrayidx, align 8
+  %inc = add nsw i64 %0, 1
+  store i64 %inc, i64* %arrayidx, align 8
+  %arrayidx4 = getelementptr inbounds %struct.A, %struct.A* %b, i64 0, i32 0, i64 5
+  %1 = load i64, i64* %arrayidx4, align 8
+  %call = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str
+, i64 0, i64 0), i64 %inc, i64 %1)
+  %2 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %2)
+  %3 = bitcast %struct.A* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) %2, i8* nonnull align 8 dereferenceable(80) %3, i64 80, i1 false)
+  %4 = bitcast %struct.A* %agg.tmp5 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %4)
+  %5 = bitcast %struct.A* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) %4, i8* nonnull align 8 dereferenceable(80) %5, i64 80, i1 false)
+  %add = add nsw i32 %i, 1
+  call void @_Z7dostuff1AS_i(%struct.A* nonnull byval(%struct.A) align 8 %agg.tmp, %struct.A* nonnull byval(%struct.A) align 8 %agg.tmp5, i32 %add)
+  call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %2)
+  call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %4)
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  ret void
+}
+
+; Function Attrs: nofree nounwind
+declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #1
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: noinline norecurse nounwind optnone uwtable
+define dso_local i32 @main() local_unnamed_addr #3 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_A]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+; CHECK-NEXT:    tail call void @_Z7dostuff1AS_i(%struct.A* byval(%struct.A) align 8 [[AGG_TMP]], %struct.A* byval(%struct.A) align 8 [[AGG_TMP1]], i32 0)
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %agg.tmp = alloca %struct.A, align 8
+  %agg.tmp1 = alloca %struct.A, align 8
+  %0 = bitcast %struct.A* %agg.tmp to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+  %1 = bitcast %struct.A* %agg.tmp1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false)
+  call void @_Z7dostuff1AS_i(%struct.A* byval(%struct.A) align 8 %agg.tmp, %struct.A* byval(%struct.A) align 8 %agg.tmp1, i32 0)
+  ret i32 0
+}
+
+attributes #0 = { uwtable }
+attributes #1 = { uwtable }
+attributes #2 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll b/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; the test was generated from the following C++ source:
+;
+; int zoo ( S p1 );
+;
+; int foo ( int count, S p1 ) {
+;   if ( count > 10 )
+;     return zoo(p1);
+;
+;   // After TRE: the temporary variable created for passing the byval
+;   // parameter p1 could be used when zoo(p1) is called.
+;   return foo(count+1, p1);
+; }
+
+; This test checks that the value of the temporary variable AGG_TMP_I
+; (byval value holder) is copied into another temporary variable
+; (AGG_TMP_I1). The copy is necessary because the source variable has a
+; reduced scope (lifetime.start/lifetime.end). Specifically, when
+; "call i32 @_Z3fooi1S" is replaced with "br label tailrecurse",
+; the value which was copied by "@llvm.memcpy.p0i8.p0i8.i64" into
+; AGG_TMP_I should later be copied into AGG_TMP_I1, since AGG_TMP_I
+; is marked with lifetime.start/lifetime.end and cannot be used
+; later by:
+;
+; "[[P1_TR:%.*]] = phi %struct.S* [ [[P1:%.*]], [[ENTRY]] ],
+;                                   [ [[AGG_TMP_I1]], [[IF_END]] ]".
+
+%struct.S = type { i32, i32, float, %struct.B }
+%struct.B = type { i32, float }
+
+; Function Attrs: uwtable
+define dso_local i32 @_Z3fooi1S(i32 %count, %struct.S* nocapture readonly byval(%struct.S) align 8 %p1) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3fooi1S(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AGG_TMP_I1:%.*]] = alloca [[STRUCT_S:%.*]], align 8
+; CHECK-NEXT:    [[AGG_TMP_I:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    [[AGG_TMP14:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_S]], align 8
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[COUNT_TR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[P1_TR:%.*]] = phi %struct.S* [ [[P1:%.*]], [[ENTRY]] ], [ [[AGG_TMP_I1]], [[IF_END]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[COUNT_TR]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.S* [[AGG_TMP]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.S* [[P1_TR]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP0]], i8* nonnull align 8 dereferenceable(20) [[TMP1]], i64 20, i1 false)
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @_Z3zoo1S(%struct.S* nonnull byval(%struct.S) align 8 [[AGG_TMP]])
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[COUNT_TR]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.S* [[AGG_TMP1]] to i8*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast %struct.S* [[P1_TR]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP2]], i8* nonnull align 8 dereferenceable(20) [[TMP3]], i64 20, i1 false)
+; CHECK-NEXT:    [[AGG_TMP14_0__SROA_CAST:%.*]] = bitcast %struct.S* [[AGG_TMP14]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[AGG_TMP14_0__SROA_CAST]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.S* [[AGG_TMP_I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[AGG_TMP14_0__SROA_CAST]], i8* nonnull align 8 dereferenceable(20) [[TMP2]], i64 20, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP4]], i8* nonnull align 8 dereferenceable(20) [[AGG_TMP14_0__SROA_CAST]], i64 20, i1 false)
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.S* [[AGG_TMP_I1]] to i8*
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.S* [[AGG_TMP_I]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 20, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[AGG_TMP14_0__SROA_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP4]])
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       return:
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %agg.tmp.i = alloca %struct.S, align 8
+  %agg.tmp14 = alloca %struct.S, align 8
+  %agg.tmp = alloca %struct.S, align 8
+  %agg.tmp1 = alloca %struct.S, align 8
+  %cmp = icmp sgt i32 %count, 10
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %0 = bitcast %struct.S* %agg.tmp to i8*
+  %1 = bitcast %struct.S* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %0, i8* nonnull align 8 dereferenceable(20) %1, i64 20, i1 false)
+  %call = call i32 @_Z3zoo1S(%struct.S* nonnull byval(%struct.S) align 8 %agg.tmp)
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %add = add nsw i32 %count, 1
+  %2 = bitcast %struct.S* %agg.tmp1 to i8*
+  %3 = bitcast %struct.S* %p1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %2, i8* nonnull align 8 dereferenceable(20) %3, i64 20, i1 false)
+  %agg.tmp14.0..sroa_cast = bitcast %struct.S* %agg.tmp14 to i8*
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %agg.tmp14.0..sroa_cast)
+  %4 = bitcast %struct.S* %agg.tmp.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %4)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %agg.tmp14.0..sroa_cast, i8* nonnull align 8 dereferenceable(20) %2, i64 20, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %4, i8* nonnull align 8 dereferenceable(20) %agg.tmp14.0..sroa_cast, i64 20, i1 false)
+  %call.i = call i32 @_Z3fooi1S(i32 %add, %struct.S* nonnull byval(%struct.S) align 8 %agg.tmp.i)
+  call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %agg.tmp14.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %4)
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ %call.i, %if.end ]
+  ret i32 %retval.0
+}
+
+declare dso_local i32 @_Z3zoo1S(%struct.S* byval(%struct.S) align 8) local_unnamed_addr #1
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2
+
+attributes #0 = { uwtable }
+attributes #1 = { uwtable }
+attributes #2 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; This test checks that TRE is done for only one recursive call.
+; The test_multiple_exits function has three recursive calls.
+; The first recursive call cannot be eliminated because a pointer
+; to a local variable escapes. The second recursive call can
+; be eliminated. The third recursive call cannot be eliminated since
+; it is not the last call. Thus, the test checks that TRE is done
+; for only the second recursive call.
+
+; IR for that test was generated from the following C++ source:
+;
+; void capture_arg (int*);
+; void test_multiple_exits (int param) {
+;   if (param >= 0 && param < 10) {
+;     int temp;
+;     capture_arg(&temp);
+;     // TRE could not be done because pointer to local
+;     // variable "temp" is escaped.
+;     test_multiple_exits(param + 1);
+;   } else if (param >=10 && param < 20) {
+;     // TRE should be done.
+;     test_multiple_exits(param + 1);
+;   } else if (param >= 20 && param < 22) {
+;     // TRE could not be done since recursive
+;     // call is not last call.
+;     test_multiple_exits(param + 1);
+;     func();
+;   }
+;
+;   return;
+; }
+
+; Function Attrs: noinline optnone uwtable
+declare void @_Z11capture_argPi(i32* %param) #0
+
+; Function Attrs: noinline optnone uwtable
+declare void @_Z4funcv() #0
+
+; Function Attrs: noinline nounwind uwtable
+define dso_local void @_Z19test_multiple_exitsi(i32 %param) local_unnamed_addr #2 {
+; CHECK-LABEL: @_Z19test_multiple_exitsi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[PARAM_TR:%.*]] = phi i32 [ [[PARAM:%.*]], [[ENTRY:%.*]] ], [ [[ADD6:%.*]], [[IF_THEN5:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[PARAM_TR]], 10
+; CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[TEMP]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP1]]) #1
+; CHECK-NEXT:    call void @_Z11capture_argPi(i32* nonnull [[TEMP]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[PARAM_TR]], 1
+; CHECK-NEXT:    call void @_Z19test_multiple_exitsi(i32 [[ADD]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) #1
+; CHECK-NEXT:    br label [[IF_END14:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[PARAM_OFF:%.*]] = add i32 [[PARAM_TR]], -10
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[PARAM_OFF]], 10
+; CHECK-NEXT:    br i1 [[TMP2]], label [[IF_THEN5]], label [[IF_ELSE7:%.*]]
+; CHECK:       if.then5:
+; CHECK-NEXT:    [[ADD6]] = add nuw nsw i32 [[PARAM_TR]], 1
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       if.else7:
+; CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[PARAM_TR]], -2
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 20
+; CHECK-NEXT:    br i1 [[TMP4]], label [[IF_THEN11:%.*]], label [[IF_END14]]
+; CHECK:       if.then11:
+; CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[PARAM_TR]], 1
+; CHECK-NEXT:    tail call void @_Z19test_multiple_exitsi(i32 [[ADD12]])
+; CHECK-NEXT:    tail call void @_Z4funcv()
+; CHECK-NEXT:    ret void
+; CHECK:       if.end14:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = alloca i32, align 4
+  %0 = icmp ult i32 %param, 10
+  br i1 %0, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %1 = bitcast i32* %temp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #2
+  call void @_Z11capture_argPi(i32* nonnull %temp)
+  %add = add nuw nsw i32 %param, 1
+  call void @_Z19test_multiple_exitsi(i32 %add)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #2
+  br label %if.end14
+
+if.else:                                          ; preds = %entry
+  %param.off = add i32 %param, -10
+  %2 = icmp ult i32 %param.off, 10
+  br i1 %2, label %if.then5, label %if.else7
+
+if.then5:                                         ; preds = %if.else
+  %add6 = add nuw nsw i32 %param, 1
+  call void @_Z19test_multiple_exitsi(i32 %add6)
+  br label %if.end14
+
+if.else7:                                         ; preds = %if.else
+  %3 = and i32 %param, -2
+  %4 = icmp eq i32 %3, 20
+  br i1 %4, label %if.then11, label %if.end14
+
+if.then11:                                        ; preds = %if.else7
+  %add12 = add nsw i32 %param, 1
+  call void @_Z19test_multiple_exitsi(i32 %add12)
+  call void @_Z4funcv()
+  br label %if.end14
+
+if.end14:                                         ; preds = %if.then5, %if.then11, %if.else7, %if.then
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+attributes #2 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+
+; The IR for this test was generated from the following C++ source:
+;
+;int count;
+;__attribute__((noinline)) void globalIncrement(const int* param) { count += *param; }
+;
+;void test(int recurseCount)
+;{
+;    if (recurseCount == 0) return;
+;    int temp = 10;
+;    globalIncrement(&temp);
+;    test(recurseCount - 1);
+;}
+;
+
+@count = dso_local local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: nofree noinline norecurse nounwind uwtable
+declare void @_Z15globalIncrementPKi(i32* nocapture readonly %param) #0
+
+; Test that TRE is performed for a tail-recursive function that passes a
+; pointer to a local stack variable to a callee that does not capture it.
+
+; Function Attrs: nounwind uwtable
+define dso_local void @_Z4testi(i32 %recurseCount) local_unnamed_addr #1 {
+; CHECK-LABEL: @_Z4testi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TEMP:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[TAILRECURSE:%.*]]
+; CHECK:       tailrecurse:
+; CHECK-NEXT:    [[RECURSECOUNT_TR:%.*]] = phi i32 [ [[RECURSECOUNT:%.*]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RECURSECOUNT_TR]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[TEMP]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    store i32 10, i32* [[TEMP]], align 4
+; CHECK-NEXT:    call void @_Z15globalIncrementPKi(i32* nonnull [[TEMP]])
+; CHECK-NEXT:    [[SUB]] = add nsw i32 [[RECURSECOUNT_TR]], -1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT:    br label [[TAILRECURSE]]
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %temp = alloca i32, align 4 ; fixed-size alloca in the entry block; its address is handed to a callee below
+  %cmp = icmp eq i32 %recurseCount, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %0 = bitcast i32* %temp to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #6 ; NOTE(review): attribute group #6 is not defined in this file (only #0-#2 are) - confirm this is intended
+  store i32 10, i32* %temp, align 4
+  call void @_Z15globalIncrementPKi(i32* nonnull %temp) ; the callee's pointer parameter is declared nocapture readonly
+  %sub = add nsw i32 %recurseCount, -1
+  call void @_Z4testi(i32 %sub) ; recursive call followed only by lifetime.end and the branch to the return block; the expected output above has a branch back to tailrecurse instead
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #6
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable }
+attributes #1 = { nounwind uwtable }
+attributes #2 = { argmemonly nounwind willreturn }