diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -81,6 +81,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -92,7 +93,10 @@ /// Scan the specified function for alloca instructions. /// If it contains any dynamic allocas, returns false. static bool canTRE(Function &F) { - // Because of PR962, we don't TRE dynamic allocas. + // TODO: We don't do TRE if dynamic allocas are used. + // Dynamic allocas allocate stack space which should be + // deallocated before a new iteration starts. That is + // currently not implemented. return llvm::all_of(instructions(F), [](Instruction &I) { auto *AI = dyn_cast(&I); return !AI || AI->isStaticAlloca(); @@ -185,11 +189,9 @@ }; } -static bool markTails(Function &F, bool &AllCallsAreTailCalls, - OptimizationRemarkEmitter *ORE) { +static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { if (F.callsFunctionThatReturnsTwice()) return false; - AllCallsAreTailCalls = true; // The local stack holds all alloca instructions and all byval arguments. 
AllocaDerivedValueTracker Tracker; @@ -272,11 +274,8 @@ } } - if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) DeferredTails.push_back(CI); - } else { - AllCallsAreTailCalls = false; - } } for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) { @@ -313,8 +312,6 @@ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n"); CI->setTailCall(); Modified = true; - } else { - AllCallsAreTailCalls = false; } } @@ -325,7 +322,16 @@ /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. /// -static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { +static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA, + DenseMap &AllocaForValue) { + if (isa(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(I)) + if (II->getIntrinsicID() == Intrinsic::lifetime_end && + llvm::findAllocaForValue(II->getArgOperand(1), AllocaForValue)) + return true; + // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. @@ -392,7 +398,6 @@ // createTailRecurseLoopHeader the first time we find a call we can eliminate. BasicBlock *HeaderBB = nullptr; SmallVector ArgumentPHIs; - bool RemovableCallsMustBeMarkedTail = false; // PHI node to store our return value. PHINode *RetPN = nullptr; @@ -414,13 +419,15 @@ // The instruction doing the accumulating. Instruction *AccumulatorRecursionInstr = nullptr; + // The cache for pairs. 
+ DenseMap AllocaForValue; + TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI, AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {} - CallInst *findTRECandidate(Instruction *TI, - bool CannotTailCallElimCallsMarkedTail); + CallInst *findTRECandidate(Instruction *TI); void createTailRecurseLoopHeader(CallInst *CI); @@ -428,11 +435,9 @@ bool eliminateCall(CallInst *CI); - bool foldReturnAndProcessPred(ReturnInst *Ret, - bool CannotTailCallElimCallsMarkedTail); + bool foldReturnAndProcessPred(ReturnInst *Ret); - bool processReturningBlock(ReturnInst *Ret, - bool CannotTailCallElimCallsMarkedTail); + bool processReturningBlock(ReturnInst *Ret); void cleanupAndFinalize(); @@ -443,8 +448,7 @@ }; } // namespace -CallInst *TailRecursionEliminator::findTRECandidate( - Instruction *TI, bool CannotTailCallElimCallsMarkedTail) { +CallInst *TailRecursionEliminator::findTRECandidate(Instruction *TI) { BasicBlock *BB = TI->getParent(); if (&BB->front() == TI) // Make sure there is something before the terminator. @@ -464,9 +468,9 @@ --BBI; } - // If this call is marked as a tail call, and if there are dynamic allocas in - // the function, we cannot perform this optimization. - if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) + assert((!CI->isTailCall() || !CI->isNoTailCall()) && + "Incompatible call site attributes(Tail,NoTail)"); + if (!CI->isTailCall()) return nullptr; // As a special case, detect code like this: @@ -498,26 +502,13 @@ BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry); BI->setDebugLoc(CI->getDebugLoc()); - // If this function has self recursive calls in the tail position where some - // are marked tail and some are not, only transform one flavor or another. - // We have to choose whether we move allocas in the entry block to the new - // entry block or not, so we can't make a good choice for both. 
We make this - // decision here based on whether the first call we found to remove is - // marked tail. - // NOTE: We could do slightly better here in the case that the function has - // no entry block allocas. - RemovableCallsMustBeMarkedTail = CI->isTailCall(); - - // If this tail call is marked 'tail' and if there are any allocas in the - // entry block, move them up to the new entry block. - if (RemovableCallsMustBeMarkedTail) - // Move all fixed sized allocas from HeaderBB to NewEntry. - for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), - NEBI = NewEntry->begin(); - OEBI != E;) - if (AllocaInst *AI = dyn_cast(OEBI++)) - if (isa(AI->getArraySize())) - AI->moveBefore(&*NEBI); + // Move all fixed sized allocas from HeaderBB to NewEntry. + for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), + NEBI = NewEntry->begin(); + OEBI != E;) + if (AllocaInst *AI = dyn_cast(OEBI++)) + if (isa(AI->getArraySize())) + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. @@ -592,7 +583,7 @@ Instruction *AccRecInstr = nullptr; BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (canMoveAboveCall(&*BBI, CI, AA)) + if (canMoveAboveCall(&*BBI, CI, AA, AllocaForValue)) continue; // If we can't move the instruction above the call, it might be because it @@ -620,9 +611,6 @@ if (!HeaderBB) createTailRecurseLoopHeader(CI); - if (RemovableCallsMustBeMarkedTail && !CI->isTailCall()) - return false; - // Ok, now that we know we have a pseudo-entry block WITH all of the // required PHI nodes, add entries into the PHI node for the actual // parameters passed into the tail-recursive call. 
@@ -672,8 +660,7 @@ return true; } -bool TailRecursionEliminator::foldReturnAndProcessPred( - ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) { +bool TailRecursionEliminator::foldReturnAndProcessPred(ReturnInst *Ret) { BasicBlock *BB = Ret->getParent(); bool Change = false; @@ -698,8 +685,7 @@ while (!UncondBranchPreds.empty()) { BranchInst *BI = UncondBranchPreds.pop_back_val(); BasicBlock *Pred = BI->getParent(); - if (CallInst *CI = - findTRECandidate(BI, CannotTailCallElimCallsMarkedTail)) { + if (CallInst *CI = findTRECandidate(BI)) { LLVM_DEBUG(dbgs() << "FOLDING: " << *BB << "INTO UNCOND BRANCH PRED: " << *Pred); FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU); @@ -720,9 +706,8 @@ return Change; } -bool TailRecursionEliminator::processReturningBlock( - ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) { - CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); +bool TailRecursionEliminator::processReturningBlock(ReturnInst *Ret) { + CallInst *CI = findTRECandidate(Ret); if (!CI) return false; @@ -810,35 +795,25 @@ return false; bool MadeChange = false; - bool AllCallsAreTailCalls = false; - MadeChange |= markTails(F, AllCallsAreTailCalls, ORE); - if (!AllCallsAreTailCalls) - return MadeChange; + MadeChange |= markTails(F, ORE); // If this function is a varargs function, we won't be able to PHI the args // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return MadeChange; - // If false, we cannot perform TRE on tail calls marked with the 'tail' - // attribute, because doing so would cause the stack size to increase (real - // TRE would deallocate variable sized allocas, TRE doesn't). - bool CanTRETailMarkedCall = canTRE(F); + if (!canTRE(F)) + return MadeChange; TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU); // Change any tail recursive calls to loops. 
- // - // FIXME: The code generator produces really bad code when an 'escaping - // alloca' is changed from being a static alloca to being a dynamic alloca. - // Until this is resolved, disable this transformation if that would ever - // happen. This bug is PR962. for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast(BB->getTerminator())) { - bool Change = TRE.processReturningBlock(Ret, !CanTRETailMarkedCall); + bool Change = TRE.processReturningBlock(Ret); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = TRE.foldReturnAndProcessPred(Ret, !CanTRETailMarkedCall); + Change = TRE.foldReturnAndProcessPred(Ret); MadeChange |= Change; } } diff --git a/llvm/test/Transforms/TailCallElim/basic.ll b/llvm/test/Transforms/TailCallElim/basic.ll --- a/llvm/test/Transforms/TailCallElim/basic.ll +++ b/llvm/test/Transforms/TailCallElim/basic.ll @@ -12,15 +12,16 @@ ret void } -; PR615. Make sure that we do not move the alloca so that it interferes with the tail call. +; Make sure that we do not do TRE if pointer to local stack +; escapes through function call. define i32 @test1() { ; CHECK: i32 @test1() ; CHECK-NEXT: alloca %A = alloca i32 ; [#uses=2] store i32 5, i32* %A call void @use(i32* %A) -; CHECK: tail call i32 @test1 - %X = tail call i32 @test1() ; [#uses=1] +; CHECK: call i32 @test1 + %X = call i32 @test1() ; [#uses=1] ret i32 %X } diff --git a/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s + +; This test checks that TRE would be done for only one recursive call. 
+; The test_multiple_exits function has three recursive calls. +; The first recursive call cannot be eliminated because a pointer +; to a local variable escapes. The second recursive call can +; be eliminated. The third recursive call cannot be eliminated since +; it is not the last call. Thus, the test checks that TRE is done +; for only the second recursive call. + +; IR for that test was generated from the following C++ source: +; +; void capture_arg (int*); +; void test_multiple_exits (int param) { +; if (param >= 0 && param < 10) { +; int temp; +; capture_arg(&temp); +; // TRE could not be done because pointer to local +; // variable "temp" is escaped. +; test_multiple_exits(param + 1); +; } else if (param >=10 && param < 20) { +; // TRE should be done. +; test_multiple_exits(param + 1); +; } else if (param >= 20 && param < 22) { +; // TRE could not be done since recursive +; // call is not last call. +; test_multiple_exits(param + 1); +; func(); +; } +; +; return; +; } + +; Function Attrs: noinline optnone uwtable +declare void @_Z11capture_argPi(i32* %param) #0 + +; Function Attrs: noinline optnone uwtable +declare void @_Z4funcv() #0 + +; Function Attrs: noinline nounwind uwtable +define dso_local void @_Z19test_multiple_exitsi(i32 %param) local_unnamed_addr #2 { +; CHECK-LABEL: @_Z19test_multiple_exitsi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[TAILRECURSE:%.*]] +; CHECK: tailrecurse: +; CHECK-NEXT: [[PARAM_TR:%.*]] = phi i32 [ [[PARAM:%.*]], [[ENTRY:%.*]] ], [ [[ADD6:%.*]], [[IF_THEN5:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[PARAM_TR]], 10 +; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TEMP]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP1]]) #1 +; CHECK-NEXT: call void @_Z11capture_argPi(i32* nonnull [[TEMP]]) +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 
[[PARAM_TR]], 1 +; CHECK-NEXT: call void @_Z19test_multiple_exitsi(i32 [[ADD]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) #1 +; CHECK-NEXT: br label [[IF_END14:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[PARAM_OFF:%.*]] = add i32 [[PARAM_TR]], -10 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[PARAM_OFF]], 10 +; CHECK-NEXT: br i1 [[TMP2]], label [[IF_THEN5]], label [[IF_ELSE7:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[PARAM_TR]], 1 +; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK: if.else7: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[PARAM_TR]], -2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 20 +; CHECK-NEXT: br i1 [[TMP4]], label [[IF_THEN11:%.*]], label [[IF_END14]] +; CHECK: if.then11: +; CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[PARAM_TR]], 1 +; CHECK-NEXT: tail call void @_Z19test_multiple_exitsi(i32 [[ADD12]]) +; CHECK-NEXT: tail call void @_Z4funcv() +; CHECK-NEXT: ret void +; CHECK: if.end14: +; CHECK-NEXT: ret void +; +entry: + %temp = alloca i32, align 4 + %0 = icmp ult i32 %param, 10 + br i1 %0, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = bitcast i32* %temp to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #2 + call void @_Z11capture_argPi(i32* nonnull %temp) + %add = add nuw nsw i32 %param, 1 + call void @_Z19test_multiple_exitsi(i32 %add) + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #2 + br label %if.end14 + +if.else: ; preds = %entry + %param.off = add i32 %param, -10 + %2 = icmp ult i32 %param.off, 10 + br i1 %2, label %if.then5, label %if.else7 + +if.then5: ; preds = %if.else + %add6 = add nuw nsw i32 %param, 1 + call void @_Z19test_multiple_exitsi(i32 %add6) + br label %if.end14 + +if.else7: ; preds = %if.else + %3 = and i32 %param, -2 + %4 = icmp eq i32 %3, 20 + br i1 %4, label %if.then11, label %if.end14 + +if.then11: ; preds = %if.else7 + %add12 = add nsw i32 %param, 1 + call void @_Z19test_multiple_exitsi(i32 %add12) + call void 
@_Z4funcv() + br label %if.end14 + +if.end14: ; preds = %if.then5, %if.then11, %if.else7, %if.then + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +attributes #0 = { nofree noinline norecurse nounwind uwtable } +attributes #1 = { nounwind uwtable } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s + +; IR for that test was generated from the following C++ source: +; +;int count; +;__attribute__((noinline)) void globalIncrement(const int* param) { count += *param; } +; +;void test(int recurseCount) +;{ +; if (recurseCount == 0) return; +; int temp = 10; +; globalIncrement(&temp); +; test(recurseCount - 1); +;} +; + +@count = dso_local local_unnamed_addr global i32 0, align 4 + +; Function Attrs: nofree noinline norecurse nounwind uwtable +declare void @_Z15globalIncrementPKi(i32* nocapture readonly %param) #0 + +; Test that TRE could be done for recursive tail routine containing +; call to function receiving a pointer to local stack. 
+ +; Function Attrs: nounwind uwtable +define dso_local void @_Z4testi(i32 %recurseCount) local_unnamed_addr #1 { +; CHECK-LABEL: @_Z4testi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[TAILRECURSE:%.*]] +; CHECK: tailrecurse: +; CHECK-NEXT: [[RECURSECOUNT_TR:%.*]] = phi i32 [ [[RECURSECOUNT:%.*]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RECURSECOUNT_TR]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TEMP]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: store i32 10, i32* [[TEMP]], align 4 +; CHECK-NEXT: call void @_Z15globalIncrementPKi(i32* nonnull [[TEMP]]) +; CHECK-NEXT: [[SUB]] = add nsw i32 [[RECURSECOUNT_TR]], -1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + %temp = alloca i32, align 4 + %cmp = icmp eq i32 %recurseCount, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %0 = bitcast i32* %temp to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #6 + store i32 10, i32* %temp, align 4 + call void @_Z15globalIncrementPKi(i32* nonnull %temp) + %sub = add nsw i32 %recurseCount, -1 + call void @_Z4testi(i32 %sub) + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #6 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +attributes #0 = { nofree noinline norecurse nounwind uwtable } +attributes #1 = { nounwind uwtable } +attributes #2 = { argmemonly nounwind willreturn }