diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -63,6 +63,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -70,6 +71,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -81,6 +83,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "tailcallelim" @@ -92,10 +95,10 @@ /// Scan the specified function for alloca instructions. /// If it contains any dynamic allocas, returns false. static bool canTRE(Function &F) { - // FIXME: The code generator produces really bad code when an 'escaping - // alloca' is changed from being a static alloca to being a dynamic alloca. - // Until this is resolved, disable this transformation if that would ever - // happen. This bug is PR962. + // TODO: We don't do TRE if dynamic allocas are used. + // Dynamic allocas allocate stack space which should be + // deallocated before new iteration started. That is + // currently not implemented. return llvm::all_of(instructions(F), [](Instruction &I) { auto *AI = dyn_cast(&I); return !AI || AI->isStaticAlloca(); @@ -188,11 +191,9 @@ }; } -static bool markTails(Function &F, bool &AllCallsAreTailCalls, - OptimizationRemarkEmitter *ORE) { +static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) { if (F.callsFunctionThatReturnsTwice()) return false; - AllCallsAreTailCalls = true; // The local stack holds all alloca instructions and all byval arguments. AllocaDerivedValueTracker Tracker; @@ -275,11 +276,8 @@ } } - if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) DeferredTails.push_back(CI); - } else { - AllCallsAreTailCalls = false; - } } for (auto *SuccBB : make_range(succ_begin(BB), succ_end(BB))) { @@ -316,8 +314,6 @@ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n"); CI->setTailCall(); Modified = true; - } else { - AllCallsAreTailCalls = false; } } @@ -329,6 +325,14 @@ /// instructions between the call and this instruction are movable. /// static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { + if (isa(I)) + return true; + + if (const IntrinsicInst *II = dyn_cast(I)) + if (II->getIntrinsicID() == Intrinsic::lifetime_end && + llvm::findAllocaForValue(II->getArgOperand(1))) + return true; + // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. @@ -395,7 +399,6 @@ // createTailRecurseLoopHeader the first time we find a call we can eliminate. BasicBlock *HeaderBB = nullptr; SmallVector ArgumentPHIs; - bool RemovableCallsMustBeMarkedTail = false; // PHI node to store our return value. PHINode *RetPN = nullptr; @@ -422,8 +425,7 @@ DomTreeUpdater &DTU) : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {} - CallInst *findTRECandidate(BasicBlock *BB, - bool CannotTailCallElimCallsMarkedTail); + CallInst *findTRECandidate(BasicBlock *BB); void createTailRecurseLoopHeader(CallInst *CI); @@ -433,7 +435,9 @@ void cleanupAndFinalize(); - bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail); + bool processBlock(BasicBlock &BB); + + Value *createTempForByValOperand(CallInst *CI, int OpndIdx); public: static bool eliminate(Function &F, const TargetTransformInfo *TTI, @@ -442,8 +446,7 @@ }; } // namespace -CallInst *TailRecursionEliminator::findTRECandidate( - BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) { +CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) { Instruction *TI = BB->getTerminator(); if (&BB->front() == TI) // Make sure there is something before the terminator. @@ -463,9 +466,9 @@ --BBI; } - // If this call is marked as a tail call, and if there are dynamic allocas in - // the function, we cannot perform this optimization. - if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) + assert((!CI->isTailCall() || !CI->isNoTailCall()) && + "Incompatible call site attributes(Tail,NoTail)"); + if (!CI->isTailCall()) return nullptr; // As a special case, detect code like this: @@ -497,26 +500,13 @@ BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry); BI->setDebugLoc(CI->getDebugLoc()); - // If this function has self recursive calls in the tail position where some - // are marked tail and some are not, only transform one flavor or another. - // We have to choose whether we move allocas in the entry block to the new - // entry block or not, so we can't make a good choice for both. We make this - // decision here based on whether the first call we found to remove is - // marked tail. - // NOTE: We could do slightly better here in the case that the function has - // no entry block allocas. - RemovableCallsMustBeMarkedTail = CI->isTailCall(); - - // If this tail call is marked 'tail' and if there are any allocas in the - // entry block, move them up to the new entry block. - if (RemovableCallsMustBeMarkedTail) - // Move all fixed sized allocas from HeaderBB to NewEntry. - for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), - NEBI = NewEntry->begin(); - OEBI != E;) - if (AllocaInst *AI = dyn_cast(OEBI++)) - if (isa(AI->getArraySize())) - AI->moveBefore(&*NEBI); + // Move all fixed sized allocas from HeaderBB to NewEntry. + for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(), + NEBI = NewEntry->begin(); + OEBI != E;) + if (AllocaInst *AI = dyn_cast(OEBI++)) + if (isa(AI->getArraySize())) + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. @@ -581,6 +571,37 @@ ++NumAccumAdded; } +Value *TailRecursionEliminator::createTempForByValOperand(CallInst *CI, + int OpndIdx) { + PointerType *ArgTy = cast(CI->getArgOperand(OpndIdx)->getType()); + Type *AggTy = ArgTy->getElementType(); + const DataLayout &DL = F.getParent()->getDataLayout(); + + // Calculate alignment of byVal operand. + Align Alignment(DL.getPrefTypeAlignment(AggTy)); + + // If the byval had an alignment specified, we *must* use at least that + // alignment, as it is required by the byval argument (and uses of the + // pointer inside the callee). + Alignment = max(Alignment, MaybeAlign(CI->getParamAlign(OpndIdx))); + + // Create alloca for temporarily byval operands. + // Put alloca into the entry block. + Value *NewAlloca = new AllocaInst( + AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment, + CI->getArgOperand(OpndIdx)->getName(), &*F.getEntryBlock().begin()); + + IRBuilder<> Builder(CI); + Value *Size = Builder.getInt64(DL.getTypeAllocSize(AggTy)); + + // Copy data from byvalue operand into the temporarily variable. + Builder.CreateMemCpy(NewAlloca, /*DstAlign*/ Alignment, + CI->getArgOperand(OpndIdx), + /*SrcAlign*/ Alignment, Size); + + return NewAlloca; +} + bool TailRecursionEliminator::eliminateCall(CallInst *CI) { ReturnInst *Ret = cast(CI->getParent()->getTerminator()); @@ -619,14 +640,15 @@ if (!HeaderBB) createTailRecurseLoopHeader(CI); - if (RemovableCallsMustBeMarkedTail && !CI->isTailCall()) - return false; - // Ok, now that we know we have a pseudo-entry block WITH all of the // required PHI nodes, add entries into the PHI node for the actual // parameters passed into the tail-recursive call. - for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) - ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + if (CI->isByValArgument(i)) + ArgumentPHIs[i]->addIncoming(createTempForByValOperand(CI, i), BB); + else + ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB); + } if (AccRecInstr) { insertAccumulator(AccRecInstr); @@ -743,8 +765,7 @@ } } -bool TailRecursionEliminator::processBlock( - BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) { +bool TailRecursionEliminator::processBlock(BasicBlock &BB) { Instruction *TI = BB.getTerminator(); if (BranchInst *BI = dyn_cast(TI)) { @@ -757,7 +778,7 @@ if (!Ret) return false; - CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail); + CallInst *CI = findTRECandidate(&BB); if (!CI) return false; @@ -778,7 +799,7 @@ eliminateCall(CI); return true; } else if (isa(TI)) { - CallInst *CI = findTRECandidate(&BB, CannotTailCallElimCallsMarkedTail); + CallInst *CI = findTRECandidate(&BB); if (CI) return eliminateCall(CI); @@ -796,26 +817,21 @@ return false; bool MadeChange = false; - bool AllCallsAreTailCalls = false; - MadeChange |= markTails(F, AllCallsAreTailCalls, ORE); - if (!AllCallsAreTailCalls) - return MadeChange; + MadeChange |= markTails(F, ORE); // If this function is a varargs function, we won't be able to PHI the args // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return MadeChange; - // If false, we cannot perform TRE on tail calls marked with the 'tail' - // attribute, because doing so would cause the stack size to increase (real - // TRE would deallocate variable sized allocas, TRE doesn't). - bool CanTRETailMarkedCall = canTRE(F); + if (!canTRE(F)) + return MadeChange; // Change any tail recursive calls to loops. TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU); for (BasicBlock &BB : F) - MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall); + MadeChange |= TRE.processBlock(BB); TRE.cleanupAndFinalize(); diff --git a/llvm/test/Transforms/TailCallElim/basic.ll b/llvm/test/Transforms/TailCallElim/basic.ll --- a/llvm/test/Transforms/TailCallElim/basic.ll +++ b/llvm/test/Transforms/TailCallElim/basic.ll @@ -12,15 +12,16 @@ ret void } -; PR615. Make sure that we do not move the alloca so that it interferes with the tail call. +; Make sure that we do not do TRE if pointer to local stack +; escapes through function call. define i32 @test1() { ; CHECK: i32 @test1() ; CHECK-NEXT: alloca %A = alloca i32 ; [#uses=2] store i32 5, i32* %A call void @use(i32* %A) -; CHECK: tail call i32 @test1 - %X = tail call i32 @test1() ; [#uses=1] +; CHECK: call i32 @test1 + %X = call i32 @test1() ; [#uses=1] ret i32 %X } diff --git a/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll b/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/TailCallElim/tre-byval-parameter-2.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s + +; the test was generated from the following C++ source: +; +; #include +; typedef struct A { long long x[10] = {0}; } A; +; A global; +; void dostuff(A a, A b, int i) { +; if (i==10) return; +; a.x[5]++; +; printf("%lld %lld\n", a.x[5], b.x[5]); dostuff(b, a, i+1); +; } +; __attribute((optnone)) int main() { dostuff(global, global, 0); } +; +; This test checks that value for ByValue operands are copied +; into temporarily variables before function call(as per +; definition of the byVal operands). Additionally values from +; these temporarily variables(byval value holders) are copied into +; another temporarily variables which are passed to the next iteration. +; That is neccessary since original byval value holders have reduced +; lifetime scope and could not be used later. Specifically: +; Value of the B_TR is copied into AGG_TMP, A_TR is copied into +; AGG_TMP5. AGG_TMP and AGG_TMP5 are marked with lifetime markers. +; Later values from these byval holders are copied into +; temporarily variable used on the next iteration of the loop. +; AGG_TMP is copied into AGG_TMP1, AGG_TMP5 is copied into +; AGG_TMP52. An then they are used at next iteration. +; +; [[A_TR:%.*]] = phi %struct.A* [ [[A:%.*]], [[ENTRY:%.*]] ], +; [ [[AGG_TMP1]], [[IF_END:%.*]] ] +; [[B_TR:%.*]] = phi %struct.A* [ [[B:%.*]], [[ENTRY]] ], +; [ [[AGG_TMP52]], [[IF_END]] ] + +%struct.A = type { [10 x i64] } + +@global = dso_local local_unnamed_addr global %struct.A zeroinitializer, align 8 +@.str = private unnamed_addr constant [11 x i8] c"%lld %lld\0A\00", align 1 + +; Function Attrs: noinline nounwind uwtable +define dso_local void @_Z7dostuff1AS_i(%struct.A* nocapture byval(%struct.A) align 8 %a, %struct.A* nocapture readonly byval(%struct.A) align 8 %b, i32 %i) local_unnamed_addr #0 { +; CHECK-LABEL: @_Z7dostuff1AS_i( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AGG_TMP52:%.*]] = alloca [[STRUCT_A:%.*]], align 8 +; CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_A]], align 8 +; CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_A]], align 8 +; CHECK-NEXT: [[AGG_TMP5:%.*]] = alloca [[STRUCT_A]], align 8 +; CHECK-NEXT: br label [[TAILRECURSE:%.*]] +; CHECK: tailrecurse: +; CHECK-NEXT: [[A_TR:%.*]] = phi %struct.A* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[AGG_TMP1]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[B_TR:%.*]] = phi %struct.A* [ [[B:%.*]], [[ENTRY]] ], [ [[AGG_TMP52]], [[IF_END]] ] +; CHECK-NEXT: [[I_TR:%.*]] = phi i32 [ [[I:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I_TR]], 10 +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_A]], %struct.A* [[A_TR]], i64 0, i32 0, i64 5 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[INC:%.*]] = add nsw i64 [[TMP0]], 1 +; CHECK-NEXT: store i64 [[INC]], i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_A]], %struct.A* [[B_TR]], i64 0, i32 0, i64 5 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i64 [[INC]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull [[TMP2]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.A* [[B_TR]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) [[TMP2]], i8* nonnull align 8 dereferenceable(80) [[TMP3]], i64 80, i1 false) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.A* [[AGG_TMP5]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.A* [[A_TR]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) [[TMP4]], i8* nonnull align 8 dereferenceable(80) [[TMP5]], i64 80, i1 false) +; CHECK-NEXT: [[ADD]] = add nsw i32 [[I_TR]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8* +; CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 80, i1 false) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast %struct.A* [[AGG_TMP52]] to i8* +; CHECK-NEXT: [[TMP9:%.*]] = bitcast %struct.A* [[AGG_TMP5]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 80, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull [[TMP2]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull [[TMP4]]) +; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + %agg.tmp = alloca %struct.A, align 8 + %agg.tmp5 = alloca %struct.A, align 8 + %cmp = icmp eq i32 %i, 10 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %arrayidx = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0, i64 5 + %0 = load i64, i64* %arrayidx, align 8 + %inc = add nsw i64 %0, 1 + store i64 %inc, i64* %arrayidx, align 8 + %arrayidx4 = getelementptr inbounds %struct.A, %struct.A* %b, i64 0, i32 0, i64 5 + %1 = load i64, i64* %arrayidx4, align 8 + %call = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str +, i64 0, i64 0), i64 %inc, i64 %1) + %2 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %2) + %3 = bitcast %struct.A* %b to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) %2, i8* nonnull align 8 dereferenceable(80) %3, i64 80, i1 false) + %4 = bitcast %struct.A* %agg.tmp5 to i8* + call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %4) + %5 = bitcast %struct.A* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(80) %4, i8* nonnull align 8 dereferenceable(80) %5, i64 80, i1 false) + %add = add nsw i32 %i, 1 + call void @_Z7dostuff1AS_i(%struct.A* nonnull byval(%struct.A) align 8 %agg.tmp, %struct.A* nonnull byval(%struct.A) align 8 %agg.tmp5, i32 %add) + call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %2) + call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %4) + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; Function Attrs: nofree nounwind +declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #1 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define dso_local i32 @main() local_unnamed_addr #3 { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_A:%.*]], align 8 +; CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_A]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.A* [[AGG_TMP]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.A* [[AGG_TMP1]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false) +; CHECK-NEXT: tail call void @_Z7dostuff1AS_i(%struct.A* byval(%struct.A) align 8 [[AGG_TMP]], %struct.A* byval(%struct.A) align 8 [[AGG_TMP1]], i32 0) +; CHECK-NEXT: ret i32 0 +; +entry: + %agg.tmp = alloca %struct.A, align 8 + %agg.tmp1 = alloca %struct.A, align 8 + %0 = bitcast %struct.A* %agg.tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %0, i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false) + %1 = bitcast %struct.A* %agg.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %1, i8* align 8 bitcast (%struct.A* @global to i8*), i64 80, i1 false) + call void @_Z7dostuff1AS_i(%struct.A* byval(%struct.A) align 8 %agg.tmp, %struct.A* byval(%struct.A) align 8 %agg.tmp1, i32 0) + ret i32 0 +} + +attributes #0 = { uwtable } +attributes #1 = { uwtable } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll b/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/TailCallElim/tre-byval-parameter.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s + +; the test was generated from the following C++ source: +; +; int zoo ( S p1 ); +; +; int foo ( int count, S p1 ) { +; if ( count > 10 ) +; return zoo(p1); +; +; // After TRE: temporarily variable created for passing byvalue parameter +; // p1 could be used when zoo(p1) is called. +; return foo(count+1, p1); +; } + +; this test checks that value of temporarily variable AGG_TMP_I +; (byVal value holder) is copied into another temporarily variable +; (AGG_TMP_I1). That is neccessary to copy data from variable with +; reduced scope (lifetime.start/lifetime.end). Specifically when +; "call i32 @_Z3fooi1S" is replaced with "br label tailrecurse" +; the value which were copied by "@llvm.memcpy.p0i8.p0i8.i64" into +; AGG_TMP_I should be later copied into AGG_TMP_I1. Since AGG_TMP_I +; is marked with lifetime.start/lifetime.end and could not be used +; later by: +; +; "[[P1_TR:%.*]] = phi %struct.S* [ [[P1:%.*]], [[ENTRY]] ], +; [ [[AGG_TMP_I1]], [[IF_END]] ]". + +%struct.S = type { i32, i32, float, %struct.B } +%struct.B = type { i32, float } + +; Function Attrs: uwtable +define dso_local i32 @_Z3fooi1S(i32 %count, %struct.S* nocapture readonly byval(%struct.S) align 8 %p1) local_unnamed_addr #0 { +; CHECK-LABEL: @_Z3fooi1S( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AGG_TMP_I1:%.*]] = alloca [[STRUCT_S:%.*]], align 8 +; CHECK-NEXT: [[AGG_TMP_I:%.*]] = alloca [[STRUCT_S]], align 8 +; CHECK-NEXT: [[AGG_TMP14:%.*]] = alloca [[STRUCT_S]], align 8 +; CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_S]], align 8 +; CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_S]], align 8 +; CHECK-NEXT: br label [[TAILRECURSE:%.*]] +; CHECK: tailrecurse: +; CHECK-NEXT: [[COUNT_TR:%.*]] = phi i32 [ [[COUNT:%.*]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[P1_TR:%.*]] = phi %struct.S* [ [[P1:%.*]], [[ENTRY]] ], [ [[AGG_TMP_I1]], [[IF_END]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[COUNT_TR]], 10 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[AGG_TMP]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[P1_TR]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP0]], i8* nonnull align 8 dereferenceable(20) [[TMP1]], i64 20, i1 false) +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @_Z3zoo1S(%struct.S* nonnull byval(%struct.S) align 8 [[AGG_TMP]]) +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[ADD]] = add nsw i32 [[COUNT_TR]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.S* [[AGG_TMP1]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[P1_TR]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP2]], i8* nonnull align 8 dereferenceable(20) [[TMP3]], i64 20, i1 false) +; CHECK-NEXT: [[AGG_TMP14_0__SROA_CAST:%.*]] = bitcast %struct.S* [[AGG_TMP14]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[AGG_TMP14_0__SROA_CAST]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.S* [[AGG_TMP_I]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull [[TMP4]]) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[AGG_TMP14_0__SROA_CAST]], i8* nonnull align 8 dereferenceable(20) [[TMP2]], i64 20, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) [[TMP4]], i8* nonnull align 8 dereferenceable(20) [[AGG_TMP14_0__SROA_CAST]], i64 20, i1 false) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.S* [[AGG_TMP_I1]] to i8* +; CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.S* [[AGG_TMP_I]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 20, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[AGG_TMP14_0__SROA_CAST]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull [[TMP4]]) +; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK: return: +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %agg.tmp.i = alloca %struct.S, align 8 + %agg.tmp14 = alloca %struct.S, align 8 + %agg.tmp = alloca %struct.S, align 8 + %agg.tmp1 = alloca %struct.S, align 8 + %cmp = icmp sgt i32 %count, 10 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %0 = bitcast %struct.S* %agg.tmp to i8* + %1 = bitcast %struct.S* %p1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %0, i8* nonnull align 8 dereferenceable(20) %1, i64 20, i1 false) + %call = call i32 @_Z3zoo1S(%struct.S* nonnull byval(%struct.S) align 8 %agg.tmp) + br label %return + +if.end: ; preds = %entry + %add = add nsw i32 %count, 1 + %2 = bitcast %struct.S* %agg.tmp1 to i8* + %3 = bitcast %struct.S* %p1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %2, i8* nonnull align 8 dereferenceable(20) %3, i64 20, i1 false) + %agg.tmp14.0..sroa_cast = bitcast %struct.S* %agg.tmp14 to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %agg.tmp14.0..sroa_cast) + %4 = bitcast %struct.S* %agg.tmp.i to i8* + call void @llvm.lifetime.start.p0i8(i64 20, i8* nonnull %4) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %agg.tmp14.0..sroa_cast, i8* nonnull align 8 dereferenceable(20) %2, i64 20, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(20) %4, i8* nonnull align 8 dereferenceable(20) %agg.tmp14.0..sroa_cast, i64 20, i1 false) + %call.i = call i32 @_Z3fooi1S(i32 %add, %struct.S* nonnull byval(%struct.S) align 8 %agg.tmp.i) + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %agg.tmp14.0..sroa_cast) + call void @llvm.lifetime.end.p0i8(i64 20, i8* nonnull %4) + br label %return + +return: ; preds = %if.end, %if.then + %retval.0 = phi i32 [ %call, %if.then ], [ %call.i, %if.end ] + ret i32 %retval.0 +} + +declare dso_local i32 @_Z3zoo1S(%struct.S* byval(%struct.S) align 8) local_unnamed_addr #1 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2 + +attributes #0 = { uwtable } +attributes #1 = { uwtable } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/TailCallElim/tre-multiple-exits.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s + +; This test checks that TRE would be done for only one recursive call. +; The test_multiple_exits function has three recursive calls. +; First recursive call could not be eliminated because there is +; escaped pointer to local variable. Second recursive call could +; be eliminated. Thrid recursive call could not be eliminated since +; this is not last call. Thus, test checks that TRE would be done +; for only second recursive call. + +; IR for that test was generated from the following C++ source: +; +; void capture_arg (int*); +; void test_multiple_exits (int param); +; if (param >= 0 && param < 10) { +; int temp; +; capture_arg(&temp); +; // TRE could not be done because pointer to local +; // variable "temp" is escaped. +; test_multiple_exits(param + 1); +; } else if (param >=10 && param < 20) { +; // TRE should be done. +; test_multiple_exits(param + 1); +; } else if (param >= 20 && param < 22) { +; // TRE could not be done since recursive +; // call is not last call. +; test_multiple_exits(param + 1); +; func(); +; } +; +; return; +; } + +; Function Attrs: noinline optnone uwtable +declare void @_Z11capture_argPi(i32* %param) #0 + +; Function Attrs: noinline optnone uwtable +declare void @_Z4funcv() #0 + +; Function Attrs: noinline nounwind uwtable +define dso_local void @_Z19test_multiple_exitsi(i32 %param) local_unnamed_addr #2 { +; CHECK-LABEL: @_Z19test_multiple_exitsi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[TAILRECURSE:%.*]] +; CHECK: tailrecurse: +; CHECK-NEXT: [[PARAM_TR:%.*]] = phi i32 [ [[PARAM:%.*]], [[ENTRY:%.*]] ], [ [[ADD6:%.*]], [[IF_THEN5:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[PARAM_TR]], 10 +; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TEMP]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP1]]) #1 +; CHECK-NEXT: call void @_Z11capture_argPi(i32* nonnull [[TEMP]]) +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[PARAM_TR]], 1 +; CHECK-NEXT: call void @_Z19test_multiple_exitsi(i32 [[ADD]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) #1 +; CHECK-NEXT: br label [[IF_END14:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[PARAM_OFF:%.*]] = add i32 [[PARAM_TR]], -10 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[PARAM_OFF]], 10 +; CHECK-NEXT: br i1 [[TMP2]], label [[IF_THEN5]], label [[IF_ELSE7:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[PARAM_TR]], 1 +; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK: if.else7: +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[PARAM_TR]], -2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 20 +; CHECK-NEXT: br i1 [[TMP4]], label [[IF_THEN11:%.*]], label [[IF_END14]] +; CHECK: if.then11: +; CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[PARAM_TR]], 1 +; CHECK-NEXT: tail call void @_Z19test_multiple_exitsi(i32 [[ADD12]]) +; CHECK-NEXT: tail call void @_Z4funcv() +; CHECK-NEXT: ret void +; CHECK: if.end14: +; CHECK-NEXT: ret void +; +entry: + %temp = alloca i32, align 4 + %0 = icmp ult i32 %param, 10 + br i1 %0, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = bitcast i32* %temp to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #2 + call void @_Z11capture_argPi(i32* nonnull %temp) + %add = add nuw nsw i32 %param, 1 + call void @_Z19test_multiple_exitsi(i32 %add) + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #2 + br label %if.end14 + +if.else: ; preds = %entry + %param.off = add i32 %param, -10 + %2 = icmp ult i32 %param.off, 10 + br i1 %2, label %if.then5, label %if.else7 + +if.then5: ; preds = %if.else + %add6 = add nuw nsw i32 %param, 1 + call void @_Z19test_multiple_exitsi(i32 %add6) + br label %if.end14 + +if.else7: ; preds = %if.else + %3 = and i32 %param, -2 + %4 = icmp eq i32 %3, 20 + br i1 %4, label %if.then11, label %if.end14 + +if.then11: ; preds = %if.else7 + %add12 = add nsw i32 %param, 1 + call void @_Z19test_multiple_exitsi(i32 %add12) + call void @_Z4funcv() + br label %if.end14 + +if.end14: ; preds = %if.then5, %if.then11, %if.else7, %if.then + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +attributes #0 = { nofree noinline norecurse nounwind uwtable } +attributes #1 = { nounwind uwtable } +attributes #2 = { argmemonly nounwind willreturn } diff --git a/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/TailCallElim/tre-noncapturing-alloca-calls.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s + +; IR for that test was generated from the following C++ source: +; +;int count; +;__attribute__((noinline)) void globalIncrement(const int* param) { count += *param; } +; +;void test(int recurseCount) +;{ +; if (recurseCount == 0) return; +; int temp = 10; +; globalIncrement(&temp); +; test(recurseCount - 1); +;} +; + +@count = dso_local local_unnamed_addr global i32 0, align 4 + +; Function Attrs: nofree noinline norecurse nounwind uwtable +declare void @_Z15globalIncrementPKi(i32* nocapture readonly %param) #0 + +; Test that TRE could be done for recursive tail routine containing +; call to function receiving a pointer to local stack. + +; Function Attrs: nounwind uwtable +define dso_local void @_Z4testi(i32 %recurseCount) local_unnamed_addr #1 { +; CHECK-LABEL: @_Z4testi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: br label [[TAILRECURSE:%.*]] +; CHECK: tailrecurse: +; CHECK-NEXT: [[RECURSECOUNT_TR:%.*]] = phi i32 [ [[RECURSECOUNT:%.*]], [[ENTRY:%.*]] ], [ [[SUB:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[RECURSECOUNT_TR]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TEMP]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: store i32 10, i32* [[TEMP]], align 4 +; CHECK-NEXT: call void @_Z15globalIncrementPKi(i32* nonnull [[TEMP]]) +; CHECK-NEXT: [[SUB]] = add nsw i32 [[RECURSECOUNT_TR]], -1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK: return: +; CHECK-NEXT: ret void +; +entry: + %temp = alloca i32, align 4 + %cmp = icmp eq i32 %recurseCount, 0 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %0 = bitcast i32* %temp to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #6 + store i32 10, i32* %temp, align 4 + call void @_Z15globalIncrementPKi(i32* nonnull %temp) + %sub = add nsw i32 %recurseCount, -1 + call void @_Z4testi(i32 %sub) + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #6 + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +attributes #0 = { nofree noinline norecurse nounwind uwtable } +attributes #1 = { nounwind uwtable } +attributes #2 = { argmemonly nounwind willreturn }