Index: include/llvm-c/Transforms/Scalar.h
===================================================================
--- include/llvm-c/Transforms/Scalar.h
+++ include/llvm-c/Transforms/Scalar.h
@@ -123,6 +123,9 @@
 /** See llvm::createTailCallEliminationPass function. */
 void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM);
 
+/** See llvm::createRecursionStackEliminationPass function. */
+void LLVMAddRecursionStackEliminationPass(LLVMPassManagerRef PM);
+
 /** See llvm::createConstantPropagationPass function. */
 void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM);
 
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -382,6 +382,7 @@
 void initializeStripSymbolsPass(PassRegistry&);
 void initializeStructurizeCFGPass(PassRegistry&);
 void initializeTailCallElimPass(PassRegistry&);
+void initializeRecursionStackEliminationPass(PassRegistry&);
 void initializeTailDuplicatePass(PassRegistry&);
 void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
 void initializeTargetPassConfigPass(PassRegistry&);
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -280,6 +280,14 @@
 
 //===----------------------------------------------------------------------===//
 //
+// RecursionStackElimination - This pass eliminates two or more call
+// instructions to the current function which occur immediately before return
+// instructions, by explicitly modelling a worklist on the stack.
+//
+FunctionPass *createRecursionStackEliminationPass();
+
+//===----------------------------------------------------------------------===//
+//
 // EarlyCSE - This pass performs a simple and fast CSE pass over the dominator
 // tree.
 //
Index: include/llvm/Transforms/Scalar/RecursionStackElimination.h
===================================================================
--- /dev/null
+++ include/llvm/Transforms/Scalar/RecursionStackElimination.h
@@ -0,0 +1,27 @@
+//===- RecursionStackElimination.h - Eliminate recursions ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the Recursion Stack Elimination pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H
+#define LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct RecursionStackEliminationPass
+    : PassInfoMixin<RecursionStackEliminationPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -137,6 +137,7 @@
 #include "llvm/Transforms/Scalar/NewGVN.h"
 #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
 #include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/Transforms/Scalar/RecursionStackElimination.h"
 #include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
 #include "llvm/Transforms/Scalar/SCCP.h"
 #include "llvm/Transforms/Scalar/SROA.h"
@@ -358,6 +359,12 @@
   assert(Level != O0 && "Must request optimizations!");
   FunctionPassManager FPM(DebugLogging);
 
+  FPM.addPass(SROA());                          // Try to get rid of allocas
+  FPM.addPass(SimplifyCFGPass());               // Merge & remove BBs
+  FPM.addPass(RecursionStackEliminationPass());
+  FPM.addPass(ADCEPass());                      // Delete dead instructions
+  FPM.addPass(SimplifyCFGPass());               // Merge & remove BBs
+
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROA());
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -546,6 +546,12 @@
   MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
 
   addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
+
+  MPM.add(createSROAPass());
+  MPM.add(createCFGSimplificationPass());       // Merge & remove BBs
+  MPM.add(createRecursionStackEliminationPass());
+  MPM.add(createAggressiveDCEPass());           // Delete dead instructions
+
   addFunctionSimplificationPasses(MPM);
 
   // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -55,6 +55,7 @@
   Reassociate.cpp
+  RecursionStackElimination.cpp
   Reg2Mem.cpp
   RewriteStatepointsForGC.cpp
   SCCP.cpp
   SROA.cpp
   Scalar.cpp
Index: lib/Transforms/Scalar/RecursionStackElimination.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/RecursionStackElimination.cpp
@@ -0,0 +1,2158 @@
+//===- RecursionStackElimination.cpp - Eliminate multiple tail recursions -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts multiple tail recursive calls into a loop, by modeling
+// the calls as a singly linked worklist explicitly on the stack.
+//
+// void f(a, b, const c) {
+//   ...
+//   if (...)
+//     return
+//
+//   a1, b1, b2, b3, x2, x3 = ...
+//
+//   f(a1, b1, c) // 1st recursion
+//   a2 = h(x2)
+//   f(a2, b2, c) // 2nd recursion
+//   a3 = h(x3)
+//   f(a3, b3, c) // 3rd recursion
+// }
+//
+// ->
+//
+// void f(a, b, const c) {
+//   struct { x, b, next } *worklist = null
+//
+// loop:
+//   ...
+//   if (...)
+//     goto check_return
+//
+//   a1, b1, b2, b3, x2, x3 = ...
+//
+//   /* Assign arguments for the first call */
+//
+//   a = a1
+//   b = b1
+//
+//   /* Put the arguments of the remaining calls into the worklist */
+//
+//   queue = alloca(2 * sizeof(*worklist))
+//   queue[0] = {x2, b2, &queue[1]}
+//   queue[1] = {x3, b3, worklist}
+//   worklist = queue
+//
+//   goto loop
+//
+// check_return:
+//   if (!worklist)
+//     return
+//   a = h(worklist->x)
+//   b = worklist->b
+//   worklist = worklist->next
+//
+//   goto loop
+// }
+//
+// Such patterns occur, for example, when an application traverses k-ary full
+// trees.
+//
+// The benefit of this transformation is that neither the frame nor the link
+// address has to be stored on the stack. Also, pass-through arguments, like
+// 'const c' in the example above, are less likely to be saved onto the stack,
+// and if so, less often (only once in the entry block).
+//
+// The downsides are:
+//
+// a) An additional store for the worklist is required.
+//
+// b) The worst case stack memory required after this transformation depends
+//    on the number of recursive calls, instead of the recursion depth.
+//
+// c) Additional conditional branches.
+//
+// ad a) This point is compensated by avoiding storing the link address in each
+// call.
+//
+// ad b) This pass can additionally add (and does so by default) code to manage
+// unused blocks of allocas in a freelist. Because allocations are done in
+// blocks, the freelist management can also be done in blocks. This requires
+// the last item in each block to be marked, detectable by the code within the
+// return path. A further variable containing the current freelist is stored on
+// the stack; on the other hand, by not executing a recursion the frame pointer
+// loads and stores are omitted. Currently the following mark algorithms are
+// implemented:
+//
+// * TrueMarker: This algorithm marks each element and is applicable to binary
+//   recursions.
+//
+// * FalseMarker: This algorithm marks no element and is applicable when
+//   freelist management is disabled.
+//
+// * FieldMarker: A separate bit is allocated in the worklist item. This marker
+//   requires additional instructions but can be used in the general case.
+//
+// * CompareMarker: Assumes allocas return memory addresses in a strictly
+//   monotonic order. The freelist can be modeled to return the same order
+//   when pulling elements from it. Comparing each worklist item with
+//   worklist.next can then reveal whether the element is marked.
+//
+// * TaggedMarker: (not yet implemented) Similar to the FieldMarker, this
+//   marker marks the last item by a bit. But instead of using a separate bit
+//   it uses bit 0 of the worklist field. If the alignment of the worklist is
+//   a power of 2, and if it is >= 2, this marker can also cover the general
+//   case. It requires some additional bit masking but no additional memory
+//   operations.
+//
+// ad c) The pass adds one conditional branch into the return path and two
+// additional branches for freelist management (see (b) above). Depending on
+// the target, machine branch prediction can alleviate this.
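+//
+// For illustration, the worklist item type created by createListType below
+// roughly corresponds to the following C struct (a sketch only; the exact
+// operand fields depend on the stacked arguments, and the mark field is only
+// present with the FieldMarker):
+//
+//   struct rse_listty {
+//     struct { ... /* one field per stacked operand */ } operands;
+//     struct {
+//       struct rse_listty *next;
+//       /* bool mark; (FieldMarker only) */
+//     } link;
+//   };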
+//
+// Algorithm outline:
+// ------------------
+//
+// Analysis Phase:
+//
+// 1) Analyze the function and gather the returning basic blocks (BB) (and BBs
+//    branching to a return-only BB) with recursive calls (RC).
+//
+// 2) If more than one such BB or no RC is found, abandon the transformation.
+//
+// 3) If the remaining BB has only one RC, abandon the transformation.
+//    Otherwise let N be the number of RCs in this BB.
+//
+// 4) Analyze the instructions in this BB from the first RC until the
+//    terminator instruction and classify each instruction as movable or
+//    static.
+//
+//    A movable instruction is an instruction that can be safely moved before
+//    the first RC. All other instructions are classified as static.
+//
+// 5) Assign each static instruction to the following RC instruction. If static
+//    instructions are left after the last RC, abandon the transformation.
+//
+// 6) Build the function H with all its arguments for each RC. By including the
+//    call itself in function H and by enforcing this function to return void,
+//    it is ensured that there are no escaping values used after the recursion.
+//
+// 6.1) Note: By the way step 4 is executed it is guaranteed that function H
+//    for the first RC consists of a single instruction: the call itself. The
+//    first call candidate is handled specially (the same way as in Tail
+//    Recursion Elimination (TRE)).
+//
+// 5,6) Note: The information collected on each RC is stored in the structure
+//    RecursiveCall.
+//
+// 7) Compare the second RC's function H with all later ones. The behavior must
+//    match, otherwise abandon the transformation.
+//
+// 7.1) Note: As the first RC's function H is basically a TRE, it can be
+//    ignored in this step.
+//
+// 8) Simplify the argument list by removing constants and pass-through
+//    arguments.
+//
+// 9) Decide whether it is profitable to apply the transformation.
+//
+// Transformation Phase:
+//
+// 1) Adjust the entry block and split off the loop entry.
+//
+// 2) Eliminate the first RC (similar to TRE).
+//
+// 3) Eliminate the remaining RCs by allocating a new block (or picking it from
+//    the freelist) of N-1 struct items and filling it. This array is put at
+//    the front of the list. Pulling from the list is added in (4). The
+//    execution of function H is ensured in (5).
+//
+// 4) Create a new return block which pulls items from the worklist. If an
+//    end-of-block marker is reached, the block is put into the freelist.
+//
+// 5) Add the instructions from function H into the return block and create
+//    the loop.
+//
+// 6) Redirect each returning block into the new return block created in (4).
+//
+// 7) Drop the constant STACKDIRECTION. It is mainly used as a proof of
+//    concept for AArch64.
+//
+// Open issues and known TODOs - It would be great if reviewers could comment
+// on those as well:
+//
+// 1) Pipeline integration: Currently the pass is put before TRE. This includes
+//    some supporting passes, for cleanup and preparation. This cannot be left
+//    as is. The preferred way could be to adjust
+//    "AArch64TargetMachine::adjustPassManager" and only use the pass with O3.
+//    AArch64 is selected because this is the architecture the author has
+//    suitable benchmarking and test setups available for. This was tried once
+//    (in a similar way as in AMDGPUTargetMachine::adjustPassManager); however,
+//    the result was that many LLVM projects did not compile anymore because of
+//    linker problems (the Passes library was missing). Do you have any advice
+//    here? A sketch follows below.
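+//
+//    A minimal sketch of the hook meant here, modeled on
+//    AMDGPUTargetMachine::adjustPassManager (an assumption for discussion,
+//    not part of this patch):
+//
+//      void AArch64TargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
+//        PMB.addExtension(PassManagerBuilder::EP_CGSCCOptimizerLate,
+//            [](const PassManagerBuilder &B, legacy::PassManagerBase &PM) {
+//              if (B.OptLevel >= 3)
+//                PM.add(createRecursionStackEliminationPass());
+//            });
+//      }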
+//
+// 2) The way to test whether it is profitable to use the pass needs
+//    adjustment. I think that functions that spill more registers have an
+//    increased chance to profit, while functions that spill less have a lower
+//    chance for profit.
+//
+// 3) Thinking of a configurable way (maybe a separate marker class) to adjust
+//    the way markers are implemented. E.g.: putting the marker bit into the
+//    pointer on AArch64 showed a significant performance boost (test
+//    implemented in C).
+//
+// 4) Is it safe to temporarily create a function and return no-changes after
+//    deleting it? If not, is there a better way than using the
+//    FunctionComparator?
+//
+// 5) GlobalsAA needs to be preserved. Not sure about this in this context.
+//    Loads and stores are added here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/RecursionStackElimination.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+
+#include <forward_list>
+#include <map>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "recstackelim"
+
+namespace {
+// TODO (correctness): fetch this constant from TargetInfo. However, this is
+// CodeGen information and not yet available here. How to solve?
+const TargetFrameLowering::StackDirection STACKDIRECTION =
+    TargetFrameLowering::StackGrowsDown;
+
+/// Option to disable the pass.
+static cl::opt<bool>
+DisableRecursionStackElimination("disable-" DEBUG_TYPE, cl::init(false),
+    cl::Hidden, cl::desc("Disable stack recursion elimination"));
+
+/// Option to disable freelist handling.
+static cl::opt<bool>
+DisableStackCleanup("disable-" DEBUG_TYPE "-stack-cleanup", cl::init(false),
+    cl::Hidden,
+    cl::desc("Disable stack recursion elimination freelist cleanup"));
+
+/// Local implementation class.
+struct RecursionStackEliminationImpl {
+  RecursionStackEliminationImpl(Function &F, AliasAnalysis &AA,
+                                OptimizationRemarkEmitter &ORE)
+      : DL(F.getParent()->getDataLayout()),
+        C(F.getParent()->getContext()),
+        ORE(ORE),
+        F(F),
+        AA(AA) {}
+
+  bool run() {
+    // Check if feasible and profitable,
+    if (analyzeFunction()) {
+      // then eliminate the recursion.
+      eliminateRecursion();
+      return true;
+    }
+
+    return false;
+  }
+
+private:
+  /// Helper class to collect information on a single recursive call site. This
+  /// class also stores the required properties of the function H (see the
+  /// introductory description of this pass).
+  struct RecursiveCall {
+  private:
+    /// Hide the copy constructor.
+    RecursiveCall(const RecursiveCall &O)
+        : Index(O.Index), F(O.F), H(O.H), CI(O.CI), Is() {
+      llvm_unreachable("RecursiveCall cannot be copied");
+    }
+
+  public:
+    RecursiveCall(CallInst *CI, size_t Index)
+        : Index(Index), F(*CI->getParent()->getParent()), H(nullptr), CI(CI),
+          Is() {}
+
+    ~RecursiveCall() {
+      // If function H was already created, delete it. It is only used
+      // temporarily.
+      if (H)
+        H->eraseFromParent();
+    }
+
+    /// Add an instruction to the list representing the function H.
+    void addInstruction(Instruction *I) {
+      Is.push_back(I);
+    }
+
+    /// Returns the recursive call instruction.
+    CallInst *getCallInst() const {
+      return CI;
+    }
+
+    /// Test if the value V is part of function H. Because H is not created
+    /// yet, these instructions are denoted "local".
+    bool isLocalValue(Value *V, unsigned *Idx = nullptr) {
+      auto LocalInstructionIt = std::find(Is.begin(), Is.end(), V);
+
+      if (LocalInstructionIt == Is.end())
+        return false;
+
+      if (Idx)
+        *Idx = LocalInstructionIt - Is.begin();
+
+      return true;
+    }
+
+    /// Get the list of operands that are required for calling the function H.
+    const std::vector<Use *> &getIncomingOperands() const {
+      return IncomingOperands;
+    }
+
+    /// If possible create the function H (including the final recursive call
+    /// instruction). If it succeeds buildFunctionH returns true and assigns it
+    /// to the member variable H. The created function is purely temporary. Its
+    /// lifetime is bound to this object. The created function always returns
+    /// void.
+    bool buildFunctionH() {
+      if (H)
+        H->eraseFromParent();
+      H = nullptr;
+
+      // Get the argument list for H.
+      SmallVector<Type *, 8> HArgs;
+      for (auto &U: IncomingOperands)
+        HArgs.push_back(U->get()->getType());
+
+      // Get the function type and create it.
+      FunctionType *HTy = FunctionType::get(Type::getVoidTy(F.getContext()),
+                                            makeArrayRef(HArgs), F.isVarArg());
+      H = Function::Create(HTy, GlobalValue::PrivateLinkage,
+                           F.getAddressSpace(),
+                           F.getName() + ".H" + Twine(Index), F.getParent());
+      BasicBlock *NewBB = BasicBlock::Create(H->getContext(), "entry", H);
+
+      // Value-Value map to match instruction operands between the source and
+      // the clone.
+      std::map<Value *, Value *> VVMap;
+
+      // Define a lambda used to clone an instruction. If leaveConstants is
+      // true, constant operands are kept as is; otherwise they are taken from
+      // the argument list if possible.
+      auto cloneAndAppendInst = [this, &VVMap, NewBB](Instruction *I,
+                                                      bool leaveConstants) {
+        // We do not allow any uses outside the newly created function.
+        // Otherwise the function would have to return some non-void value.
+        for (Use &U : I->uses()) {
+          if (!isLocalValue(U.getUser()) && U.getUser() != CI) {
+            H->eraseFromParent();
+            H = nullptr;
+            return false;
+          }
+        }
+
+        // Clone the instruction, assign it to the newly created function and
+        // adjust all operands.
+        Instruction *NewI = I->clone();
+        VVMap[I] = NewI;
+        NewBB->getInstList().push_back(NewI);
+        for (Use &U: NewI->operands()) {
+          // Keep constant operands as is.
+          if (leaveConstants && isa<Constant>(U.get()))
+            continue;
+
+          // Replace other operands by their counterpart in the new function.
+          if (isLocalValue(U.get())) {
+            auto VVMI = VVMap.find(U.get());
+            if (VVMI == VVMap.end()) {
+              H->eraseFromParent();
+              H = nullptr;
+              return false;
+            }
+
+            U.set(VVMI->second);
+          }
+          else {
+            auto IOI = std::find_if(IncomingOperands.begin(),
+                                    IncomingOperands.end(),
+                                    [&U](const Use *S) -> bool {
+              return S->get() == U.get();
+            });
+
+            // If the operand is not found, ignore it if and only if it is a
+            // constant.
+            if (IOI == IncomingOperands.end() && isa<Constant>(U.get()))
+              continue;
+            assert(IOI != IncomingOperands.end() &&
+                   "Operand not in argument list");
+            size_t ArgIdx = IOI - IncomingOperands.begin();
+            U.set(H->arg_begin() + ArgIdx);
+          }
+        }
+
+        return true;
+      };
+      // EOL - Lambda cloneAndAppendInst
+
+      // Clone all instructions into the new function,
+      for (auto &I: Is) {
+        if (!cloneAndAppendInst(I, true))
+          return false;
+      }
+      // but allow constant arguments for the recursive call inst.
+      if (!cloneAndAppendInst(CI, false))
+        return false;
+
+      // Finally, append the terminator instruction.
+      ReturnInst::Create(H->getContext(), NewBB);
+
+      return true;
+    }
+
+    /// Return function H and create it if necessary.
+    Function *getFunctionH() {
+      if (!H) {
+        buildFunctionH();
+      }
+      return H;
+    }
+
+    /// Remove duplicates within the operands while preserving the order of
+    /// each first occurrence. The operands are given as uses. For duplicates
+    /// an arbitrary use is selected to be kept.
+    void cleanupIncomingOperands() {
+      std::map<Value *, Use *> map;
+      std::vector<Use *> tmp(IncomingOperands.begin(), IncomingOperands.end());
+      IncomingOperands.clear();
+      for (auto &O: tmp) {
+        bool inserted;
+        std::tie(std::ignore, inserted) =
+            map.insert(std::make_pair(O->get(), O));
+        if (inserted)
+          IncomingOperands.push_back(O);
+      }
+    }
+
+    /// Collect all required operands for instruction I. If allowConstants is
+    /// set, constants are also put on the operand list.
+    void analyzeOperands(User::op_range Operands, Instruction *I,
+                         bool allowConstants) {
+      for (auto &U: Operands) {
+        if (!allowConstants && isa<Constant>(U.get()))
+          continue;
+
+        // If the operand is a non-local value add it to the list. This covers
+        // all movable instructions, as well as static instructions of other
+        // previous candidates. For movable instructions this is feasible
+        // because these instructions will remain (moved). On the other hand,
+        // static instructions are removed; referring to those is invalid. An
+        // example is depicted in the sequence below.
+        //
+        // ; Candidate 0
+        // call @recursive(%donotcare)
+        // ; ...
+        // ;
+        // ; Candidate n
+        // %a = load %something
+        // %b = load %somethingelse
+        // call @recursive(%a)
+        // ; ...
+        // ;
+        // ; Candidate m > n
+        // call @recursive(%b)
+        //
+        // This case is not detected within this call. However, because the
+        // function H of candidate n would have to return a non-void value, it
+        // is covered during buildFunctionH.
+        if (!isLocalValue(U.get())) {
+          IncomingOperands.push_back(&U);
+        }
+      }
+    }
+
+    /// Analyze whether this candidate can be feasible for this transformation.
+    bool analyze() {
+      IncomingOperands.clear();
+      for (auto &I: Is) {
+        // In general it is not safe to replace a constant by an arbitrary
+        // value in the LLVM IR. allowConstants is therefore set to false to
+        // avoid putting them onto the argument list.
+        analyzeOperands(I->operands(), I, false);
+      }
+      // As we know that all candidates are recursive, only arg_operands are
+      // analyzed for the recursive call instruction. Also, for CallInst
+      // arg_operands it is feasible and potentially profitable to replace
+      // constants with values. E.g.:
+      //
+      // call @recursive(%this, 0);
+      // call @recursive(%this, 1);
+      // ...
+      // call @recursive(%this, n);
+      // ret
+      analyzeOperands(CI->arg_operands(), CI, true);
+
+      // Remove duplicates from the operand list.
+      cleanupIncomingOperands();
+
+      // Try to encapsulate the call and all static instructions into a new
+      // separate function.
+      if (!buildFunctionH())
+        return false;
+
+      return true;
+    }
+
+    /// Test if O's function H behaves equally compared to the own function H.
+    bool compareFunctionH(RecursiveCall *O) const {
+      GlobalNumberState GN;
+      return FunctionComparator(H, O->H, &GN).compare() == 0;
+    }
+
+    /// Erase the CallInst from the source function.
+    void eraseSourceCallInst() {
+      CI->eraseFromParent();
+      CI = nullptr;
+    }
+
+    /// Erase all static instructions from the source function.
+    void eraseSourceInstructions() {
+      eraseSourceCallInst();
+      for (auto i = Is.rbegin(), e = Is.rend(); i != e; /* in loop */) {
+        (*(i++))->eraseFromParent();
+      }
+      Is.clear();
+    }
+
+    size_t getIndex() const {
+      return Index;
+    }
+
+  private:
+    /// Index, solely used for naming.
+    size_t Index;
+
+    /// Recursive function the candidate is called within.
+    Function &F;
+
+    /// Temporary function H (including the recursive call). This function is
+    /// used during analysis and transformation. It is never called and is
+    /// destroyed when this object is deleted.
+    Function *H;
+
+    /// Recursive call instruction.
+    CallInst *CI;
+
+    /// Instructions that precede the current call.
+    SmallVector<Instruction *, 8> Is;
+    std::vector<Use *> IncomingOperands;
+  };
+
+  /// Helper class to collect and classify used arguments and analyze arguments
+  /// used by function H.
+  struct IncomingArgument {
+    /// Stacked arguments are arguments that need to be modeled on the stack.
+    ///
+    /// Static arguments are arguments which are passed through to all
+    /// recursive calls without changing position nor value.
+    ///
+    /// e.g.:
+    /// fn(a, b, c) {
+    ///   fn(a, b, c);
+    ///   fn(a + 1, a, c);
+    /// }
+    /// => StaticArguments = {c}
+    ///
+    /// Constant arguments are constants that are equal for each call except
+    /// the first.
+    ///
+    /// UnknownKind are arguments which are not yet classified.
+    enum Kind {
+      StackedKind,
+      StaticKind,
+      ConstantKind,
+      UnknownKind,
+    };
+
+    IncomingArgument() : Uses(), K(UnknownKind) {}
+
+    /// Add an argument use and update the current classification.
+    void addUse(const RecursiveCall *CC, Use *U, bool CheckOnly = false) {
+      Value *V = U->get();
+      switch (K) {
+      case UnknownKind:
+        K = StackedKind;
+        if (isa<Constant>(V)) {
+          K = ConstantKind;
+        }
+        else if (Argument *A = dyn_cast<Argument>(V)) {
+          if (A == CC->getCallInst()->getArgOperand(A->getArgNo())) {
+            K = StaticKind;
+          }
+        }
+        Ty = V->getType();
+        break;
+
+      case StackedKind:
+        // Keep Stacked as Stacked.
+        break;
+
+      case ConstantKind:
+        // Change kind to stacked if the new argument is no constant or if the
+        // constant value differs.
+        if (Constant *C = dyn_cast<Constant>(V)) {
+          if (C != getValue())
+            K = StackedKind;
+        }
+        else {
+          K = StackedKind;
+        }
+        break;
+
+      case StaticKind:
+        // Change kind to stacked if the new argument is no argument or if the
+        // position in the corresponding CallInst is unsuitable.
+        if (Argument *A = dyn_cast<Argument>(V)) {
+          if (A != CC->getCallInst()->getArgOperand(A->getArgNo())) {
+            K = StackedKind;
+          }
+        }
+        else {
+          K = StackedKind;
+        }
+        break;
+      }
+      assert(Ty == V->getType() && "Argument type mismatch");
+      if (!CheckOnly)
+        Uses.insert(std::make_pair(CC, U));
+    }
+
+    /// Get the used value of a specific candidate, or the used value of any
+    /// candidate.
+    Value *getValue(const RecursiveCall *CC = nullptr) const {
+      if (CC)
+        return Uses.find(CC)->second->get();
+      else
+        return Uses.begin()->second->get();
+    }
+
+    /// Get the type of this argument.
+    Type *getType() const {
+      return Ty;
+    }
+
+    /// Returns true if this argument is of stacked kind.
+    bool isStacked() const {
+      assert(K != UnknownKind && "Argument kind is unknown");
+      return K == StackedKind;
+    }
+
+    /// Returns true if this argument is of static kind.
+    bool isStatic() const {
+      assert(K != UnknownKind && "Argument kind is unknown");
+      return K == StaticKind;
+    }
+
+    /// Returns true if this argument is of constant kind.
+    bool isConstant() const {
+      assert(K != UnknownKind && "Argument kind is unknown");
+      return K == ConstantKind;
+    }
+
+  private:
+    /// Candidate-Use map of the argument.
+    std::map<const RecursiveCall *, Use *> Uses;
+
+    /// Classification.
+    Kind K;
+
+    /// Argument type.
+    Type *Ty;
+  };
+
+  struct TrueMarker {
+    using ItemListContainer = SmallVectorImpl<Type *>;
+
+    TrueMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : C(C), OtherCandidatesNumOf(OtherCandidatesNumOf) {}
+    virtual ~TrueMarker() {}
+
+    virtual bool mayPullFromFreeList() {
+      // This is the default behavior.
+      return true;
+    }
+
+    virtual bool checkListType(Type *ListTy) {
+      return true;
+    }
+
+    virtual void adjustStructFields(ItemListContainer &ItemList) {}
+
+    virtual void addSetupMarkerInst(Value *LinkPtr, unsigned EntryIndex,
+                                    Instruction *InsertPos,
+                                    const DebugLoc &DebugLoc,
+                                    const DataLayout &DL) {
+    }
+
+    virtual Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                                      Value *NextPtr, Instruction *InsertPos,
+                                      const DebugLoc &DebugLoc) {
+      return ConstantInt::getTrue(IntegerType::get(C, 1));
+    }
+
+    virtual unsigned getItemIndex(unsigned Item) {
+      return Item;
+    }
+
+    LLVMContext &C;
+    const unsigned OtherCandidatesNumOf;
+  };
+
+  struct FalseMarker : public TrueMarker {
+    using base = TrueMarker;
+
+    FalseMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : base(C, OtherCandidatesNumOf) {}
+
+    bool mayPullFromFreeList() override {
+      return false;
+    }
+
+    Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                              Value *NextPtr, Instruction *InsertPos,
+                              const DebugLoc &DebugLoc) override {
+      return ConstantInt::getFalse(IntegerType::get(C, 1));
+    }
+  };
+
+  struct FieldMarker : public TrueMarker {
+    using base = TrueMarker;
+
+    FieldMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : base(C, OtherCandidatesNumOf), Index(-1) {}
+
+    void adjustStructFields(ItemListContainer &ItemList) override {
+      Index = ItemList.size();
+      ItemList.push_back(IntegerType::get(C, 1));
+    }
+
+    void addSetupMarkerInst(Value *LinkPtr, unsigned EntryIndex,
+                            Instruction *InsertPos, const DebugLoc &DebugLoc,
+                            const DataLayout &DL) override {
+      Value *Idx[2] = {
+          ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), 0),
+          ConstantInt::get(IntegerType::get(C, 32), Index) };
+
+      Type *LinkTy = cast<PointerType>(LinkPtr->getType())->getElementType();
+      Instruction *MarkPtrGEP = GetElementPtrInst::CreateInBounds(
+          LinkTy, LinkPtr, makeArrayRef(Idx, 2),
+          "rse.markptr." + Twine(EntryIndex), InsertPos);
+      MarkPtrGEP->setDebugLoc(DebugLoc);
+
+      Value *Mark;
+      if (EntryIndex < OtherCandidatesNumOf - 1) {
+        Mark = ConstantInt::getFalse(IntegerType::get(C, 1));
+      }
+      else {
+        Mark = ConstantInt::getTrue(IntegerType::get(C, 1));
+      }
+
+      StoreInst *SMI = new StoreInst(Mark, MarkPtrGEP, false,
+                                     DL.getABITypeAlignment(Mark->getType()),
+                                     InsertPos);
+      SMI->setDebugLoc(DebugLoc);
+    }
+
+    Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                              Value *NextPtr, Instruction *InsertPos,
+                              const DebugLoc &DebugLoc) override {
+      assert(Index != (unsigned)-1 && "Marker field index unknown");
+
+      Instruction *Mark = ExtractValueInst::Create(
+          ListItem, makeArrayRef(&Index, 1), "rse.mark", InsertPos);
+      Mark->setDebugLoc(DebugLoc);
+      return Mark;
+    }
+
+    unsigned Index;
+  };
+
+  struct CompareMarker : public TrueMarker {
+    using base = TrueMarker;
+
+    CompareMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : base(C, OtherCandidatesNumOf) {}
+
+    unsigned getItemIndex(unsigned Item) override {
+      if (STACKDIRECTION == TargetFrameLowering::StackGrowsDown)
+        return OtherCandidatesNumOf - 1 - Item;
+      else
+        return Item;
+    }
+
+    Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                              Value *NextPtr, Instruction *InsertPos,
+                              const DebugLoc &DebugLoc) override {
+      Instruction *Mark;
+      if (STACKDIRECTION == TargetFrameLowering::StackGrowsDown)
+        // TODO (correctness): What if NextPtr == nullptr? This is currently a
+        // false negative which unnecessarily spills the stack. This case has
+        // to be handled efficiently (e.g.: use (unsigned)-1 instead of
+        // nullptr, iff this is valid on the target machine).
+        Mark = new ICmpInst(InsertPos, CmpInst::ICMP_ULT, ListHead, NextPtr,
+                            "rse.mark");
+      else
+        Mark = new ICmpInst(InsertPos, CmpInst::ICMP_UGT, ListHead, NextPtr,
+                            "rse.mark");
+      Mark->setDebugLoc(DebugLoc);
+      return Mark;
+    }
+  };
+
+  typedef std::forward_list<RecursiveCall> CandidateContainer;
+
+  /// Current DataLayout.
+  const DataLayout &DL;
+
+  /// Current Context.
+  LLVMContext &C;
+
+  /// Optimization Remark Emitter to use.
+  OptimizationRemarkEmitter &ORE;
+
+  /// Function the pass is working on.
+  Function &F;
+
+  /// Alias Analysis of the current function.
+  AliasAnalysis &AA;
+
+  /// Type of list elements for the selected marker algorithm.
+  StructType *ListTy;
+
+  /// List of call candidates to be eliminated; this list contains the single
+  /// first candidate.
+  CandidateContainer FirstCandidate;
+
+  /// List of the remaining call candidates to be eliminated.
+  CandidateContainer OtherCandidates;
+
+  /// Number of elements in the OtherCandidates list.
+  unsigned OtherCandidatesNumOf;
+
+  /// List of arguments for the function H.
+  SmallVector<IncomingArgument, 4> IncomingArguments;
+
+  /// List of movable instructions.
+  SmallVector<Instruction *, 8> MovableInst;
+
+  /// List of static instructions.
+  SmallVector<Instruction *, 8> StaticInst;
+
+  /// Selected marker algorithm.
+  std::unique_ptr<TrueMarker> Marker;
+
+  /// Find all recursive calls within the current basic block and add them to
+  /// the list of candidates.
+  ///
+  /// \returns true if at least one recursive call is found.
+  bool appendAllRecursiveCallsInBB(Function &F, BasicBlock &BB,
+                                   unsigned &NumOf,
+                                   CandidateContainer &Candidates,
+                                   CandidateContainer::iterator &LastCandidate) {
+    bool found = false;
+    for (auto &I: BB) {
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (CI && CI->getCalledFunction() == &F) {
+        LastCandidate = Candidates.emplace_after(LastCandidate, CI, NumOf);
+        NumOf++;
+        found = true;
+      }
+    }
+    return found;
+  }
+
+  /// Test if the given basic block is a return-only BB.
+  bool isReturnOnlyBB(const BasicBlock &BB) const {
+    const ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+    return (RI != nullptr && BB.getFirstNonPHIOrDbg() == RI);
+  }
+
+  /// Return true if it is safe to move the specified instruction from after
+  /// the call to before the call, assuming that all instructions between the
+  /// call and this instruction are movable.
+  ///
+  /// This is a copy of canMoveAboveCall within TailRecursionElimination.
+  ///
+  /// \returns true if the instruction can be moved.
+  bool canMoveAbove(Instruction *I, Instruction *SI) {
+    // FIXME: We can move load/store/call/free instructions above the call if
+    // the call does not mod/ref the memory location being processed.
+    if (I->mayHaveSideEffects()) // This also handles volatile loads.
+      return false;
+
+    if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      // Loads may always be moved above calls without side effects.
+      if (SI->mayHaveSideEffects()) {
+        const DataLayout &DL = L->getModule()->getDataLayout();
+        if (isModSet(AA.getModRefInfo(SI, MemoryLocation::get(L))) ||
+            !isSafeToLoadUnconditionally(L->getPointerOperand(),
+                                         L->getAlignment(), DL, L))
+          return false;
+      }
+    }
+
+    // Otherwise, if this is a side-effect free instruction, check to make sure
+    // that it does not use the return value of the call. If it doesn't use the
+    // return value of the call, it must only use things that are defined
+    // before the call, or movable instructions between the call and the
+    // instruction itself.
+    return !is_contained(I->operands(), SI);
+  }
+
+  /// Analyze the current function and return true if it is feasible and
+  /// profitable to apply the transformation.
+  ///
+  /// \returns true if the transformation should be applied.
+  bool analyzeFunction() {
+    // Number of basic blocks containing all recursive calls.
+    int BBwithCandidates = 0;
+    bool hasAllocas = false;
+
+    CandidateContainer Candidates;
+    auto LastCandidate = Candidates.before_begin();
+
+    // We cannot handle variadic arguments.
+    if (F.isVarArg())
+      return false;
+
+    // TODO (improvement): There might be ways to implement non-void returns,
+    // for example similar to accumulator TREs.
+    if (F.getReturnType() != Type::getVoidTy(F.getContext()))
+      return false;
+
+    // No, can't be done.
+    if (F.callsFunctionThatReturnsTwice())
+      return false;
+
+    unsigned CandidatesNumOf = 0;
+    for (auto &BB: F) {
+      // EH pads cannot be handled at all.
+      if (BB.isEHPad()) {
+        return false;
+      }
+
+      // Alloca instructions are not allowed.
+      //
+      // TODO (improvement): Allocas can theoretically be handled if it can be
+      // proven that the used memory's lifetime is bounded before the first
+      // call. In this case the alloca can be moved into the new entry block
+      // the same way tail recursion elimination does.
+      for (auto &I: BB) {
+        if (isa<AllocaInst>(&I)) {
+          hasAllocas = true;
+        }
+      }
+
+      if (isa<ReturnInst>(BB.getTerminator())) {
+        // For returning blocks find all candidates.
+        if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates,
+                                        LastCandidate)) {
+          BBwithCandidates++;
+        }
+      }
+      else if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+        // For non-returning blocks find all candidates, iff the block has a
+        // single successor which is a return-only block.
+        if (BI->isUnconditional() && isReturnOnlyBB(*BI->getSuccessor(0))) {
+          if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates,
+                                          LastCandidate)) {
+            BBwithCandidates++;
+          }
+        }
+      }
+
+      // Currently only one BB with tail recursive calls is allowed.
+      //
+      // TODO (improvement): There might be interesting cases with recursive
+      // calls in more than one BB.
+      if (BBwithCandidates > 1) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                     "Recursion elimination missed",
+                     F.getEntryBlock().front().getDebugLoc(),
+                     &F.getEntryBlock())
+                 << "Multiple Basic Blocks";
+        });
+        return false;
+      }
+    }
+
+    // No RSE required. Leave it to TRE.
+    if (CandidatesNumOf < 2) {
+      return false;
+    }
+
+    if (hasAllocas) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE,
+                   "Recursion elimination missed",
+                   F.getEntryBlock().front().getDebugLoc(),
+                   &F.getEntryBlock())
+               << "Functions containing alloca instructions not yet supported";
+      });
+      return false;
+    }
+
+    BasicBlock *BB = Candidates.front().getCallInst()->getParent();
+    Instruction *TI = BB->getTerminator();
+
+    // All instructions between the first candidate and the terminator are
+    // partitioned into static (unmovable) instructions and movable
+    // instructions. The resulting two arrays are ordered in the required
+    // execution sequence.
+    //
+    // This means that the execution order (MovableInst | StaticInst) is legal
+    // and equivalent to the original sequence.
+    //
+    // Algorithm:
+    //
+    // * Iterate over all instructions between Candidate[n] and Candidate[n+1].
+    //
+    // * Test if the instruction can be moved above all identified static
+    //   instructions.
+    //
+    // * If it can, add it to the movable instruction list, otherwise add it
+    //   to the static instruction list.
+    //
+    // * Consecutively add static instructions if necessary.
+
+    // Iterate over all instructions in the interval
+    // (Candidate[x], Candidate[x+1]). Virtually include the terminator as an
+    // additional candidate.
+    //
+    // Set up the first iteration interval between Candidate[0] and
+    // Candidate[1]. Note: The list has at least two members.
+    auto Cand0 = Candidates.begin();
+    auto Cand1 = next(Cand0);
+    BasicBlock::iterator BBI(Cand0->getCallInst());
+    auto CandI = Cand1;
+
+    // The first candidate is always a single call which is a static
+    // instruction.
+    StaticInst.push_back(&*BBI);
+
+    while (true) {
+      // Identify the instruction that marks the end of the current pair.
+      //
+      // It is either CandI, or the terminator instruction (which is virtually
+      // added).
+      Instruction *E;
+      if (CandI != Candidates.end()) {
+        E = CandI->getCallInst();
+      }
+      else {
+        E = TI;
+      }
+
+      // Advance the iterator first, because we start with the first call
+      // instruction which is already added.
+      ++BBI;
+
+      // If we reached the terminator we are done.
+      if (&*BBI == TI)
+        break;
+
+      // The current search interval's end candidate is reached?
+      if (&*BBI == E) {
+        StaticInst.push_back(&*BBI);
+        // Advance the current iteration interval to the next candidate.
+        ++CandI;
+        continue;
+      }
+
+      // Test if the instruction can be moved and add it accordingly.
+      bool movable = true;
+      for (auto &SI: StaticInst) {
+        if (canMoveAbove(&*BBI, &*SI))
+          continue;
+        movable = false;
+        break;
+      }
+
+      if (movable)
+        MovableInst.push_back(&*BBI);
+      else
+        StaticInst.push_back(&*BBI);
+    }
+
+    // Assign all static instructions to the candidates. We do it naively by
+    // just assigning the instructions between two candidates to the second
+    // one.
+    auto SI = StaticInst.begin();
+    for (auto &Cand: Candidates) {
+      while (*SI != Cand.getCallInst()) {
+        Cand.addInstruction(*SI);
+        SI++;
+      }
+      // Skip the candidate's call inst.
+      SI++;
+    }
+
+    // If not all static instructions have been assigned, then there are
+    // unmovable trailing instructions and the transformation is abandoned.
+    if (SI != StaticInst.end()) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE,
+                   "Recursion elimination missed", *SI)
+               << "Instructions after last call.";
+      });
+      return false;
+    }
+
+    // Analyze for each call candidate whether the call and the corresponding
+    // static instructions are suitable to be eliminated.
+    for (auto &Cand: Candidates) {
+      if (!Cand.analyze()) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                     "Recursion elimination missed", Cand.getCallInst())
+                 << "RecursiveCall nonviable";
+        });
+        return false;
+      }
+    }
+
+    // Test if the functions generated during analysis (step before) behave
+    // equivalently.
+    for (auto CCI = std::next(Cand1), CCE = Candidates.end();
+         CCI != CCE; ++CCI) {
+
+      // Test if the function H(...) of both candidates match.
+      if (!CCI->compareFunctionH(&*Cand1)) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                     "Recursion elimination missed", CCI->getCallInst())
+                 << "RecursiveCall nonviable (Functions H do not match)";
+        });
+        return false;
+      }
+    }
+
+    // Analyze the operands of each function H and categorize them into
+    // Stacked, Constant and Static.
+    IncomingArguments.resize(Cand1->getIncomingOperands().size(),
+                             IncomingArgument());
+    for (auto CCI = Cand1, CCE = Candidates.end(); CCI != CCE; ++CCI) {
+      assert(IncomingArguments.size() == CCI->getIncomingOperands().size() &&
+             "Operand size mismatch");
+
+      auto IAI = IncomingArguments.begin();
+      for (Use *U: CCI->getIncomingOperands()) {
+        IAI->addUse(&*CCI, U);
+
+        // If the value is an Argument we need to check the position in the
+        // first candidate too. It is sufficient here to just execute the
+        // checks.
+        if (isa<Argument>(U->get())) {
+          IAI->addUse(&*Cand0, U, true);
+        }
+        ++IAI;
+      }
+    }
+
+    // Count the categories.
+    int NumStaticArguments = 0;
+    int NumConstantArguments = 0;
+    for (auto &IA: IncomingArguments) {
+      if (IA.isConstant())
+        NumConstantArguments++;
+      if (IA.isStatic())
+        NumStaticArguments++;
+    }
+
+    // Estimate costs and gain.
+    if (IncomingArguments.size() - NumStaticArguments - NumConstantArguments
+            + 2 > F.arg_size()) {
+      LLVM_DEBUG(dbgs() << "Will not apply - too costly\n");
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE,
+                   "Recursion elimination missed", Cand0->getCallInst())
+               << "Too costly";
+      });
+      return false;
+    }
+
+    OtherCandidatesNumOf = CandidatesNumOf - 1;
+
+    if (DisableStackCleanup) {
+      Marker.reset(new FalseMarker(C, OtherCandidatesNumOf));
+    }
+    else {
+      if (OtherCandidatesNumOf == 1) {
+        Marker.reset(new TrueMarker(C, OtherCandidatesNumOf));
+      }
+      else {
+        Marker.reset(new CompareMarker(C, OtherCandidatesNumOf));
+        //Marker.reset(new FieldMarker(C, OtherCandidatesNumOf));
+      }
+    }
+
+    ListTy = createListType();
+    if (!Marker->checkListType(ListTy)) {
+      return false;
+    }
+
+    // Splice the list into the two output lists.
+    // One contains only the first element, the other the remaining elements.
+    OtherCandidates.splice_after(OtherCandidates.before_begin(), Candidates,
+                                 Candidates.begin(), Candidates.end());
+    FirstCandidate.splice_after(FirstCandidate.before_begin(), Candidates);
+
+    return true;
+  }
+
+  /// Creates the worklist structure used to model the pending call list.
+  StructType *createListType() const {
+    // Collect the types of the operands.
+    SmallVector<Type *, 8> ListElements;
+    for (auto &IA: IncomingArguments) {
+      if (IA.isStacked())
+        ListElements.push_back(IA.getType());
+    }
+
+    // Create a new StructType representing the list.
+    StructType *ListTy = StructType::create(C, "rse.listty");
+    StructType *OperandsTy = StructType::get(C, makeArrayRef(ListElements),
+                                             false);
+
+    SmallVector<Type *, 2> LinkItems;
+    LinkItems.push_back(ListTy->getPointerTo());
+    Marker->adjustStructFields(LinkItems);
+    StructType *LinkTy = StructType::get(C, makeArrayRef(LinkItems), false);
+
+    Type *ListItemTys[2] = { OperandsTy, LinkTy };
+    ListTy->setBody(makeArrayRef(ListItemTys, 2), false);
+
+    return ListTy;
+  }
+
+  /// Prepare the entry block by splitting off the loop entry. The following
+  /// PHINodes are added to the loop entry:
+  ///
+  /// * One for each argument (the uses are adjusted as well).
+  /// * The list head representing the root of the worklist.
+  /// * If enabled, the root of the free list.
+  template <typename ArgumentPHIContainer>
+  std::tuple<PHINode *, PHINode *, BasicBlock *>
+  prepareEntry(ArgumentPHIContainer &ArgumentPHIs) {
+    const DebugLoc &DebugLoc =
+        FirstCandidate.front().getCallInst()->getDebugLoc();
+
+    // Split the entry block at the first viable instruction.
+    BasicBlock *Entry = &F.getEntryBlock();
+    BasicBlock *LoopEntry = Entry->splitBasicBlock(Entry->getFirstNonPHI(),
+                                                   "rse.loop");
+    Entry->getTerminator()->setDebugLoc(DebugLoc);
+
+    // Set up the list PHI node for the loop entry.
+    Instruction *InsertPos = &LoopEntry->front();
+    PHINode *ListHead = PHINode::Create(ListTy->getPointerTo(), 2, "rse.list",
+                                        InsertPos);
+    ListHead->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()),
+                          Entry);
+
+    // Set up the FreeList if necessary.
+    PHINode *FreeList = nullptr;
+    FreeList = PHINode::Create(ListTy->getPointerTo(), 2, "rse.freelist",
+                               InsertPos);
+    FreeList->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()),
+                          Entry);
+
+    // Set up argument PHI nodes for the loop entry - see TRE.
+    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(), 2,
+                                    "rse." + I->getName(), InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone uses the PHI node now!
+      PN->addIncoming(&*I, Entry);
+      ArgumentPHIs.push_back(PN);
+    }
+
+    return std::make_tuple(ListHead, FreeList, LoopEntry);
+  }
+
+  /// Adjust all ArgumentPHIs according to the CallInst to be eliminated.
+  ///
+  /// Arguments are taken from CI. IncomingBB is used as the BB to add to the
+  /// PHI list.
+  template <typename ArgumentPHIContainer>
+  void adjustPHIsFromCallInst(
+      BasicBlock *IncomingBB, CallInst *CI,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+    // Adjust PHI nodes in the loop entry.
+    CallInst::op_iterator Arg = CI->arg_begin();
+    for (auto &PHI: ArgumentPHIs) {
+      assert(Arg != CI->arg_end() &&
+             "Callsite does not provide enough arguments.");
+      PHI->addIncoming(Arg->get(), IncomingBB);
+      Arg++;
+    }
+    assert(Arg == CI->arg_end() && "Callsite provided too many arguments.");
+  }
+
+  /// Eliminates the first call just like it is done within tail call
+  /// elimination.
+  /// Additionally it moves all movable instructions before the first call
+  /// candidate, and it splits the remaining calls into a separate basic
+  /// block. ArgumentPHIs are adjusted accordingly.
+  //
+  // calling.bb:
+  //   ...
+  //   call @recursive ...
+  //
+  //   %movable1 = add ...
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %movable2 = add ...
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  //
+  // ->
+  //
+  // calling.bb:
+  //   ...
+  //   %movable1 = add ...
+  //   %movable2 = add ...
+  //   br %loopentry
+  //
+  // H1:
+  //
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  template <typename ArgumentPHIContainer>
+  std::tuple<BasicBlock *, BasicBlock *>
+  eliminateFirstCall(RecursiveCall &CC, BasicBlock *LoopEntry,
+                     const ArgumentPHIContainer &ArgumentPHIs) {
+    CallInst *CI = CC.getCallInst();
+
+    // Move all movable instructions before this call instruction.
+    for (auto &I: MovableInst)
+      I->moveBefore(CI);
+
+    // Split off the remaining static (unmovable) instructions.
+    BasicBlock *CallerBB = CI->getParent();
+    BasicBlock *StaticInstBB = CallerBB->splitBasicBlock(CI, "rse.H1");
+    BranchInst *BI = cast<BranchInst>(CallerBB->getTerminator());
+
+    // Create the loop to eliminate the call to the first candidate.
+    BI->setSuccessor(0, LoopEntry);
+    BI->setDebugLoc(CI->getDebugLoc());
+
+    adjustPHIsFromCallInst(CallerBB, CI, ArgumentPHIs);
+    CC.eraseSourceCallInst();
+
+    return std::make_tuple(CallerBB, StaticInstBB);
+  }
+
+  /// Adds code to bypass the alloca (inserted by eliminateOtherCalls) with
+  /// code picking the block from the freelist.
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //
+  //   ; allocate memory on stack
+  //   %itemptr = alloca %listty, 2
+  //
+  //   ; compute addresses of list items
+  //   %gep.0 = getelementptr %listty, 0
+  //   ...
+  //   %gep.k = getelementptr %listty, k
+  //
+  //   ; set up and store list item [0]; set mark to false
+  //   mark.0 =
+  //   %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0
+  //   store %gep.1, *nextptr.0
+  //   ...
+  //   mark.k =
+  //   %nextptr.k = getelementptr %listty, itemptr, k, 1, 0
+  //   store %listhead, %nextptr.k
+  //
+  //   ; set up and store operands entries
+  //   %operands.0 = getelementptr %listty, itemptr, 0, 0
+  //   %arg.0-0 = insertvalue mark_0, undef, %movable.0-0
+  //   store %arg.0-0, %operands.0
+  //   ...
+  //
+  // -> [AI = %itemptr, AllocaEnd = %nextptr.k, NewHeadPtr = %gep.0]
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //
+  // alloca:
+  //   ; allocate memory on stack
+  //   %itemptr = alloca %listty, 2
+  //
+  //   ; compute addresses of list items
+  //   %gep.0 = getelementptr %listty, 0
+  //   ...
+  //   %gep.k = getelementptr %listty, k
+  //
+  //   ; set up and store list item [0]; set mark to false
+  //   mark.0 =
+  //   %nextptr.0 = getelementptr %listty, %itemptr, 0, 1, 0
+  //   store %gep.1, *nextptr.0
+  //   ...
+  //   mark.k =
+  //
+  // allocfromfree:
+  //   %gep.0.fromfree = getelementptr %listty, %freelist, ...
+  //   %nextptr = getelementptr %listty, 0, #STRUCT_IDX_OFNEXT
+  //   %next = load %nextptr
+  //   br calling.bb.1
+  //
+  // calling.bb.1:
+  //   %newheadptrphi = phi %listty, [%gep.0, %alloca], [%gep.0.fromfree, %allocfromfree]
+  //   %allocated = phi %listty, [%itemptr, %alloca], [%freelist, %allocfromfree]
+  //   %newfreelist = phi %listty, [%freelist, %alloca], [%next, %allocfromfree]
+  //
+  //   %nextptr.k = getelementptr %listty, %allocated, k, 1, 0
+  //   store %listhead, %nextptr.k
+  //
+  //   ; set up and store operands entries
+  //   %operands.0 = getelementptr %listty, %allocated, 0, 0
+  //   %arg.0-0 = insertvalue mark_0, undef, %movable.0-0
+  //   store %arg.0-0, %operands.0
+  //   ...
+  Instruction *replaceAllocaWithFreeList(AllocaInst *AI,
+                                         Instruction *AllocaEnd,
+                                         Instruction *NewHeadPtr,
+                                         PHINode *FreeList) {
+    // Never generate code to pull from the free list if not necessary.
+    if (!Marker->mayPullFromFreeList()) {
+      FreeList->addIncoming(FreeList, AI->getParent());
+      return NewHeadPtr;
+    }
+
+    const DebugLoc &DebugLoc = AI->getDebugLoc();
+    BasicBlock *PreCallingBB = AI->getParent();
+    BasicBlock::iterator InsertPos(AI);
+
+    BasicBlock *AllocaBB = PreCallingBB->splitBasicBlock(InsertPos,
+                                                         "rse.alloca");
+
+    InsertPos = BasicBlock::iterator(AllocaEnd);
+    BasicBlock *CallingBB =
+        AllocaBB->splitBasicBlock(InsertPos, PreCallingBB->getName() + ".1");
+
+    PHINode *NewHeadPHI = PHINode::Create(ListTy->getPointerTo(), 2,
+                                          "rse.newhead",
+                                          CallingBB->getFirstNonPHI());
+    NewHeadPHI->addIncoming(NewHeadPtr, AllocaBB);
+
+    BasicBlock *PullBB = BasicBlock::Create(C, "rse.allocfromfree", &F,
+                                            CallingBB);
+
+    // Compute the address of the first pulled item.
+    Value *IdxNext[] = {
+        ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                         Marker->getItemIndex(0)),
+        ConstantInt::get(IntegerType::get(C, 32), 1),
+        ConstantInt::get(IntegerType::get(C, 32), 0),
+    };
+    Instruction *PulledItem0 = GetElementPtrInst::CreateInBounds(
+        ListTy, FreeList, makeArrayRef(IdxNext, 1), "rse.pulleditemgep.0",
+        PullBB);
+    PulledItem0->setDebugLoc(DebugLoc);
+    NewHeadPHI->addIncoming(PulledItem0, PullBB);
+
+    IdxNext[0] = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                                  Marker->getItemIndex(OtherCandidatesNumOf - 1));
+    Instruction *NewFreePtr = GetElementPtrInst::CreateInBounds(
+        ListTy, FreeList, makeArrayRef(IdxNext, 3), "rse.freelist.nextptr",
+        PullBB);
+    NewFreePtr->setDebugLoc(DebugLoc);
+
+    LoadInst *NewFree = new LoadInst(NewFreePtr, "rse.newfreelist", false,
+                                     DL.getABITypeAlignment(ListTy->getPointerTo()),
+                                     PullBB);
+    NewFree->setDebugLoc(DebugLoc);
+
+    BranchInst *BI = BranchInst::Create(CallingBB, PullBB);
+    BI->setDebugLoc(DebugLoc);
+
+    InsertPos = BasicBlock::iterator(CallingBB->getFirstNonPHI());
+    PHINode *AllocatedPHI = PHINode::Create(ListTy->getPointerTo(), 2,
+                                            "rse.allocated", &*InsertPos);
+    AllocatedPHI->setDebugLoc(DebugLoc);
+
+    // Replace all remaining uses of AI with the new PHI node.
+    for (auto i = AI->use_begin(), e = AI->use_end(); i != e; /* in loop */) {
+      Use &U = *(i++);
+      if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
+        if (I->getParent() == CallingBB)
+          U.set(AllocatedPHI);
+      }
+    }
+
+    AllocatedPHI->addIncoming(AI, AllocaBB);
+    AllocatedPHI->addIncoming(FreeList, PullBB);
+
+    PHINode *UpdatedFreeList = PHINode::Create(ListTy->getPointerTo(), 2,
+                                               "rse.freelist.next",
+                                               &*InsertPos);
+    UpdatedFreeList->setDebugLoc(DebugLoc);
+    UpdatedFreeList->addIncoming(FreeList, AllocaBB);
+    UpdatedFreeList->addIncoming(NewFree, PullBB);
+    FreeList->addIncoming(UpdatedFreeList,
+                          UpdatedFreeList->getParent());
+
+    InsertPos = BasicBlock::iterator(PreCallingBB->getTerminator());
+    ICmpInst *CmpI = new ICmpInst(&*InsertPos, ICmpInst::ICMP_EQ, FreeList,
+                                  ConstantPointerNull::get(ListTy->getPointerTo()),
+                                  "rse.cmp_freelist_empty");
+    CmpI->setDebugLoc(DebugLoc);
+
+    BI = BranchInst::Create(AllocaBB, PullBB, CmpI);
+    BI->setDebugLoc(DebugLoc);
+    ReplaceInstWithInst(PreCallingBB->getTerminator(), BI);
+
+    return NewHeadPHI;
+  }
+
+  /// Eliminate the other recursive calls by adding their data to the item
+  /// list.
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //   br %loopentry
+  //
+  // ->
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //
+  //   ; allocate memory on stack
+  //   %itemptr = alloca %listty, 2
+  //
+  //   ; compute addresses of list items
+  //   %gep_0 = getelementptr %listty, 0
+  //   ...
+  //   %gep_k = getelementptr %listty, k
+  //
+  //   ; set up and store list item [0]; set mark to false
+  //   mark.0 =
+  //   %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0
+  //   store %gep.1, *nextptr.0
+  //   ...
+  //   mark.k =
+  //   %nextptr.k = getelementptr %listty, itemptr, k, 1, 0
+  //   store %listhead, %nextptr.k
+  //
+  //   ; set up and store operands entries
+  //   %operands.0 = getelementptr %listty, itemptr, 0, 0
+  //   %arg.0-0 = insertvalue mark_0, undef, %movable.0-0
+  //   store %arg.0-0, %operands.0
+  //   ...
+  //   %operands.k = getelementptr %listty, itemptr, k, 0
+  //   %arg.k-0 = insertvalue mark_1, undef, %movable.k-0
+  //   store %arg.k-0, %operands.k
+  //
+  //   br %loopentry
+  //
+  //   ; below a reference to the source instructions
+  //
+  // H1:
+  //
+  //   %static.0 = call @fn1, %movable.0
+  //   call @recursive ...
+  //
+  //   %static.k = call @fn1, %movable.k
+  //   call @recursive ...
+  //
+  //   br ret
+  void eliminateOtherCalls(CandidateContainer &OtherCandidates,
+                           BasicBlock *CallerBB, PHINode *ListHead,
+                           PHINode *FreeList) {
+    Instruction *InsertPos = CallerBB->getTerminator();
+    Type *OperandsTy = ListTy->getTypeAtIndex(0U);
+
+    // Add an entry to the list for all candidates except the first. This
+    // eliminates the calls to those candidates.
+    AllocaInst *AllocaPtr = new AllocaInst(
+        ListTy, DL.getAllocaAddrSpace(),
+        ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                         OtherCandidatesNumOf),
+        DL.getABITypeAlignment(ListTy), "rse.itemptr", InsertPos);
+    AllocaPtr->setDebugLoc(InsertPos->getDebugLoc());
+
+    SmallVector<Instruction *, 4> ItemGEPs;
+    ItemGEPs.reserve(OtherCandidatesNumOf);
+    unsigned idx = 0;
+    for (auto &Cand: OtherCandidates) {
+      Value *Idx = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                                    Marker->getItemIndex(idx));
+      Instruction *ItemGEP = GetElementPtrInst::CreateInBounds(
+          ListTy, AllocaPtr, makeArrayRef(&Idx, 1),
+          "rse.itemgep." + Twine(idx), InsertPos);
+      ItemGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc());
+      ItemGEPs.push_back(ItemGEP);
+
+      idx++;
+    }
+
+    // Fill the list link structure with the necessary values. With a freelist
+    // available, all values except the last nextptr are constant during this
+    // execution instance. The address values are all relative to the allocated
+    // address, and the last mark (if any) also does not need to change when
+    // pulling the element from the freelist. AllocaEnd marks the first
+    // instruction that must be executed when pulling from the freelist.
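+    //
+    // Illustrative example (an assumption: two remaining candidates and
+    // StackGrowsDown, i.e. the CompareMarker reverses the item order via
+    // getItemIndex): candidate 0 is written to itemptr[1] with next =
+    // &itemptr[0]; candidate 1 is written to itemptr[0] with next = the old
+    // list head, and the address computation of that last next field is what
+    // AllocaEnd points to.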
+    Instruction *AllocaEnd = nullptr;
+    idx = 0;
+    for (auto &Cand: OtherCandidates) {
+      const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc();
+
+      Value *Idx[3] = {
+          ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                           Marker->getItemIndex(idx)),
+          ConstantInt::get(IntegerType::get(C, 32), 1),
+          ConstantInt::get(IntegerType::get(C, 32), 0) };
+
+      Instruction *LinkPtrGEP = GetElementPtrInst::CreateInBounds(
+          ListTy, AllocaPtr, makeArrayRef(Idx, 2),
+          "rse.linkptr." + Twine(idx), InsertPos);
+      LinkPtrGEP->setDebugLoc(DebugLoc);
+      Marker->addSetupMarkerInst(LinkPtrGEP, idx, InsertPos, DebugLoc, DL);
+
+      Instruction *NextPtrGEP = GetElementPtrInst::CreateInBounds(
+          ListTy, AllocaPtr, makeArrayRef(Idx, 3),
+          "rse.nextptr." + Twine(idx), InsertPos);
+      NextPtrGEP->setDebugLoc(DebugLoc);
+
+      Value *NextVal;
+      if (idx < OtherCandidatesNumOf - 1) {
+        NextVal = ItemGEPs[idx+1];
+      }
+      else {
+        NextVal = ListHead;
+        AllocaEnd = NextPtrGEP;
+      }
+
+      StoreInst *SNPI = new StoreInst(NextVal, NextPtrGEP, false,
+                                      DL.getABITypeAlignment(NextVal->getType()),
+                                      InsertPos);
+      SNPI->setDebugLoc(DebugLoc);
+
+      idx++;
+    }
+
+    // Fill the operands part of the list items.
+    idx = 0;
+    for (auto &Cand: OtherCandidates) {
+      const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc();
+
+      unsigned ValueIdx = 0;
+      Instruction *OperandsVal = nullptr;
+      for (auto &IA: IncomingArguments) {
+        if (!IA.isStacked())
+          continue;
+
+        Value *ValueToStore = IA.getValue(&Cand);
+
+        OperandsVal = InsertValueInst::Create(
+            (OperandsVal ? cast<Value>(OperandsVal)
+                         : cast<Value>(UndefValue::get(OperandsTy))),
+            ValueToStore, makeArrayRef(&ValueIdx, 1),
+            "rse.arg." + Twine(idx) + ".Op" + Twine(ValueIdx), InsertPos);
+        OperandsVal->setDebugLoc(DebugLoc);
+        ValueIdx++;
+      }
+
+      // OperandsVal might be null if no operands have to be stored in the
+      // list.
+      if (OperandsVal) {
+        // Create the GEP that points to the operands struct.
+        Value *Idx[2] = {
+            ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                             Marker->getItemIndex(idx)),
+            ConstantInt::get(IntegerType::get(C, 32), 0) };
+        Instruction *OperandsGEP = GetElementPtrInst::CreateInBounds(
+            ListTy, AllocaPtr, makeArrayRef(Idx, 2),
+            "rse.operandsgep." + Twine(idx), InsertPos);
+        OperandsGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc());
+
+        StoreInst *SOVI = new StoreInst(OperandsVal, OperandsGEP, false,
+                                        DL.getABITypeAlignment(OperandsTy),
+                                        InsertPos);
+        SOVI->setDebugLoc(DebugLoc);
+      }
+
+      idx++;
+    }
+
+    Instruction *NewHead = replaceAllocaWithFreeList(AllocaPtr, AllocaEnd,
+                                                     ItemGEPs[0], FreeList);
+
+    // Redirect the list head by adjusting the listhead PHI node in the loop
+    // entry.
+    ListHead->addIncoming(NewHead, NewHead->getParent());
+  }
+
+  /// Create a new return block which pulls from the list. It loops until the
+  /// list is empty. It returns if and only if the list is empty.
+  ///
+  /// This function splits the terminator instructions from the basic block
+  /// currently containing the remaining static instructions.
+  //
+  // H1:
+  //
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  //
+  // ->
+  //
+  // pull:
+  //   %listempty = icmp %listhead, null
+  //   br %listempty, %return, %H1
+  //
+  // H1:
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br return
+  //
+  // return:
+  //   br ret
+  //
+  std::tuple<BasicBlock *, BasicBlock *, BasicBlock *>
+  createNewReturnBlock(BasicBlock *StaticInstBB, RecursiveCall *CC, PHINode *ListHead) {
+    CallInst *CI = CC->getCallInst();
+    const DebugLoc &DebugLoc = CI->getDebugLoc();
+
+    // If the block is a non-returning block, then its single successor is a
+    // return-only block and the branch is unconditional.
+    if (BranchInst *BI = dyn_cast<BranchInst>(StaticInstBB->getTerminator())) {
+      BasicBlock *RetOnlyBB = StaticInstBB->getSingleSuccessor();
+      assert(RetOnlyBB && "Recursion has more than one successor");
+      Instruction *NewTI = RetOnlyBB->getTerminator()->clone();
+      NewTI->insertBefore(BI);
+      NewTI->setDebugLoc(BI->getDebugLoc());
+      BI->eraseFromParent();
+    }
+
+    // Create the BB which pulls an item from the list.
+    BasicBlock *PullBB = BasicBlock::Create(C, "rse.pull", &F, StaticInstBB);
+
+    // Split off the terminator instruction. This becomes the returning
+    // basic block.
+    BasicBlock *NewReturnBB = StaticInstBB->splitBasicBlock(
+        StaticInstBB->getTerminator(), "rse.return");
+    NewReturnBB->getTerminator()->setDebugLoc(DebugLoc);
+
+    ICmpInst *CmpI = new ICmpInst(*PullBB,
+                                  ICmpInst::ICMP_EQ,
+                                  ListHead,
+                                  ConstantPointerNull::get(ListTy->getPointerTo()),
+                                  "rse.cmp_list_empty");
+    CmpI->setDebugLoc(DebugLoc);
+
+    BranchInst *BI = BranchInst::Create(NewReturnBB, StaticInstBB, CmpI, PullBB);
+    BI->setDebugLoc(DebugLoc);
+
+    return std::make_tuple(PullBB, StaticInstBB, NewReturnBB);
+  }
+
+  /// Insert the basic block which returns items to the freelist. The mark of
+  /// a pulled item signals whether it is the last item of a bulk allocation;
+  /// only then is the whole block prepended to the freelist.
+  //
+  // H1:
+  //   ; load data from list
+  //   %listitem = load %listhead
+  //
+  //   ; extract data from list (mark is omitted if possible)
+  //   %next = extractvalue %listitem, 1, 0
+  //   ...
+  //
+  // -> [instructions are inserted below %next]
+  //
+  // H1:
+  //   ; load data from list
+  //   %listitem = load %listhead
+  //
+  //   ; extract data from list (mark is omitted if possible)
+  //   %next = extractvalue %listitem, 1, 0
+  //   %mark = ...
+  //   br %mark, %addtofree, %H2
+  //
+  // addtofree:
+  //   %firstinblockptr = getelementptr %listty, %listhead, -1
+  //   %freenextptr = getelementptr %listty, %listhead, -1, 1, 0
+  //   store %freelist, %freenextptr
+  //   br %H2
+  //
+  // H2:
+  //   %newfree = phi [%freelist, %H1], [%firstinblock, %addtofree]
+  //
+  //   ...
+  //
+  void insertAddToFreeListBB(Value *ListItem, Instruction *NextPtr, PHINode *ListHead, PHINode *FreeList,
+                             const DebugLoc &DebugLoc) {
+    BasicBlock *H1 = NextPtr->getParent();
+    BasicBlock::iterator InsertPos(NextPtr);
+    ++InsertPos;
+    unsigned LinkItemIdx = 1;
+    Instruction *LinkItemVal = ExtractValueInst::Create(
+        ListItem, makeArrayRef(&LinkItemIdx, 1), "rse.linkval", &*InsertPos);
+    LinkItemVal->setDebugLoc(DebugLoc);
+    Value *Mark = Marker->addComputeMarkInst(LinkItemVal, ListHead, NextPtr, &*InsertPos, DebugLoc);
+
+    if (ConstantInt *MarkerConst = dyn_cast<ConstantInt>(Mark)) {
+      // Nothing to do: the marker is constant false.
+      if (MarkerConst->isZero()) {
+        // Adjust the freelist PHI.
+        FreeList->addIncoming(FreeList, InsertPos->getParent());
+        return;
+      }
+    }
+
+    // Compute the address of the bulk allocation.
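+    // As a sketch, assuming two items per bulk allocation with the marked
+    // (last) item at index 1: the list head then points one element past the
+    // first item of the block, so
+    //
+    //   %rse.firstinblockptr = getelementptr inbounds %listty, %listty* %listhead, i64 -1
+    //
+    // recovers the address originally produced by the alloca.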
+    signed IdxOfFirstItem = -(signed)Marker->getItemIndex(OtherCandidatesNumOf - 1);
+    Value *IdxBulk[] = {
+      ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), IdxOfFirstItem),
+    };
+    Instruction *NewFreeHead = GetElementPtrInst::CreateInBounds(ListTy,
+                                                                 ListHead,
+                                                                 makeArrayRef(IdxBulk, 1),
+                                                                 "rse.firstinblockptr",
+                                                                 &*InsertPos);
+    NewFreeHead->setDebugLoc(DebugLoc);
+
+    // Adjust the freelist PHI.
+    FreeList->addIncoming(NewFreeHead, InsertPos->getParent());
+
+    // If the freelist handling is conditional, put it into its own basic
+    // block and execute it conditionally.
+    if (isa<Instruction>(Mark)) {
+      // Split off a basic block and add a conditional branch instruction.
+      BasicBlock *H2 = H1->splitBasicBlock(InsertPos, "rse.H2");
+      BasicBlock *AddToFreeBB = H1->splitBasicBlock(NewFreeHead, "rse.addtofree");
+
+      // Insert a branch depending on the mark.
+      BranchInst *BI = BranchInst::Create(AddToFreeBB, H2, Mark);
+      BI->setDebugLoc(DebugLoc);
+      ReplaceInstWithInst(H1->getTerminator(), BI);
+
+      // Add a PHINode into H2.
+      InsertPos = BasicBlock::iterator(H2->getFirstNonPHI());
+      PHINode *FreeNextPHI = PHINode::Create(ListTy->getPointerTo(), 2,
+                                             "rse.newfree.H",
+                                             &*InsertPos);
+      FreeNextPHI->setDebugLoc(DebugLoc);
+      NewFreeHead->replaceAllUsesWith(FreeNextPHI);
+      FreeNextPHI->addIncoming(FreeList, H1);
+      FreeNextPHI->addIncoming(NewFreeHead, AddToFreeBB);
+    }
+
+    // Store the old freelist head into the last element of the item. This is
+    // done after potentially splitting off the conditional branch:
+    // `NewFreeHead->replaceAllUsesWith(FreeNextPHI)` must not rewrite this
+    // use, since we want the original value here.
+    InsertPos = std::next(BasicBlock::iterator(NewFreeHead));
+    // Compute the address of the last next pointer.
+    Value *IdxNext[] = {
+      ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(OtherCandidatesNumOf - 1)),
+      ConstantInt::get(IntegerType::get(C, 32), 1),
+      ConstantInt::get(IntegerType::get(C, 32), 0),
+    };
+    Instruction *FreeNext = GetElementPtrInst::CreateInBounds(ListTy,
+                                                              NewFreeHead,
+                                                              makeArrayRef(IdxNext, 3),
+                                                              "rse.freenextptr",
+                                                              &*InsertPos);
+    FreeNext->setDebugLoc(DebugLoc);
+
+    StoreInst *SI = new StoreInst(FreeList,
+                                  FreeNext,
+                                  false,
+                                  DL.getABITypeAlignment(ListTy->getPointerTo()),
+                                  &*InsertPos);
+    SI->setDebugLoc(DebugLoc);
+  }
+
+  /// Transform the path which does not return out of the old return block such
+  /// that it picks the information from the itemlist and loops back to the
+  /// loopentry after executing the relevant parts of the function H.
+  //
+  // pull:
+  //   %listempty = icmp %listhead, null
+  //   br %listempty, %return, %H1
+  //
+  // H1:
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br return
+  //
+  // ->
+  //
+  // pull:
+  //   %listempty = icmp %listhead, null
+  //   br %listempty, %return, %H1
+  //
+  // H1:
+  //   ; load data from list
+  //   %listitem = load %listhead
+  //
+  //   ; extract data from list (mark is omitted if possible)
+  //   %next = extractvalue %listitem, 1, 0
+  //   %mark = ...
+  //   %operand.0 = extractvalue %listitem, 0, 0
+  //
+  //   ; execute function H but replace the recursive call by a branch into the loop entry
+  //   %static1 = call @fn1, %operand.0
+  //
+  //   ; Note, %static1 is used to adjust the corresponding argument phi.
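+  //   ; Note, %next becomes the incoming value of the %listhead phi for the
+  //   ; next trip through %loopentry.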
+  //   br %loopentry
+  //
+  template <typename ArgumentPHIContainer>
+  void transformHExecution(
+      CandidateContainer &OtherCandidates,
+      BasicBlock *HExecuteBB,
+      PHINode *ListHead, PHINode *FreeList,
+      BasicBlock *LoopEntry,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+
+    Instruction *InsertPos = HExecuteBB->getFirstNonPHIOrDbg();
+
+    // Split off the old static instructions, which become unreachable; they
+    // are deleted later.
+    BasicBlock *StaticUnreachBB = HExecuteBB->splitBasicBlock(InsertPos, "rse.staticunreachable");
+    InsertPos = HExecuteBB->getTerminator();
+
+    RecursiveCall &CC = OtherCandidates.front();
+    const DebugLoc &DebugLoc = CC.getCallInst()->getDebugLoc();
+
+    // Load the list head content.
+    LoadInst *ListItem = new LoadInst(ListHead,
+                                      "rse.listitem",
+                                      false,
+                                      DL.getABITypeAlignment(ListTy),
+                                      InsertPos);
+    ListItem->setDebugLoc(DebugLoc);
+
+    // Extract the elements of the listitem to prepare the operands.
+    unsigned Idx[2] = { 1, 0 };
+
+    ExtractValueInst *NextPtr = ExtractValueInst::Create(
+        ListItem, makeArrayRef(Idx, 2), "rse.next", InsertPos);
+    NextPtr->setDebugLoc(DebugLoc);
+
+    unsigned IncomingCount = IncomingArguments.size();
+    SmallVector<Value *, 8> Incoming;
+    Incoming.reserve(IncomingCount);
+
+    Idx[0] = 0;
+    Idx[1] = 0;
+    for (IncomingArgument &IA : IncomingArguments) {
+      Value *V = IA.getValue();
+      if (IA.isConstant() || IA.isStatic()) {
+        Incoming.push_back(V);
+      } else {
+        Value *EVI = ExtractValueInst::Create(
+            ListItem, makeArrayRef(Idx, 2), "rse.operand." + Twine(Idx[1]), InsertPos);
+        Incoming.push_back(EVI);
+        Idx[1]++;
+      }
+    }
+
+    // Clone the instructions from function H back into the code sequence.
+    Function *FunctionH = CC.getFunctionH();
+    assert(FunctionH && "FunctionH not present.");
+
+    CallInst *CI = nullptr;
+    std::map<Value *, Instruction *> VVMap;
+    for (Instruction &I : FunctionH->getEntryBlock()) {
+      if (&I == FunctionH->getEntryBlock().getTerminator())
+        continue;
+
+      Instruction *NewI = I.clone();
+      VVMap[&I] = NewI;
+      NewI->insertBefore(InsertPos);
+      CI = dyn_cast<CallInst>(NewI);
+
+      for (Use &U : NewI->operands()) {
+        Value *V = U.get();
+
+        auto VVMI = VVMap.find(V);
+
+        // For an Argument, replace the operand with the corresponding item in
+        // Incoming.
+        if (Argument *A = dyn_cast<Argument>(V)) {
+          U.set(Incoming[A->getArgNo()]);
+        }
+        // For values within the value-value map, use VVMap to resolve the
+        // relation.
+        else if (VVMI != VVMap.end()) {
+          U.set(VVMI->second);
+        }
+        // Otherwise it must be a constant that can be left unchanged.
+        else {
+          assert(isa<Constant>(V) && "could not transform. Not a constant");
+        }
+      }
+    }
+
+    assert(CI && CI->getCalledFunction() == &F && "Last instruction in FunctionH was not a recursive CallInst");
+
+    // Redirect the terminator.
+    BranchInst *BI = dyn_cast<BranchInst>(HExecuteBB->getTerminator());
+    assert(BI && BI->isUnconditional() && "Invalid Terminator in stackrecurse.");
+    BI->setSuccessor(0, LoopEntry);
+
+    // Adjust the list PHI node in the loop entry.
+    ListHead->addIncoming(NextPtr, HExecuteBB);
+
+    // Adjust the argument PHI nodes in the loop entry.
+    adjustPHIsFromCallInst(HExecuteBB, CI, ArgumentPHIs);
+
+    // Finally erase the CallInst.
+    CI->eraseFromParent();
+
+    // Insert code for stack cleanup.
+    insertAddToFreeListBB(ListItem, NextPtr, ListHead, FreeList, DebugLoc);
+
+    // Remove the instructions which became unreachable.
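+    // They all live in StaticUnreachBB, so once the candidates' source
+    // instructions are erased, only the terminator may remain (asserted
+    // below).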
+    for (auto &Cand : OtherCandidates) {
+      Cand.eraseSourceInstructions();
+    }
+
+    assert(&StaticUnreachBB->front() == StaticUnreachBB->getTerminator() && "Not all instructions have been removed");
+
+    StaticUnreachBB->eraseFromParent();
+  }
+
+  /// Redirect all returning blocks into the new return block.
+  void redirectRemainingReturnBlocks(BasicBlock *PullBB, BasicBlock *NewReturnBB) {
+    for (auto &BB : F) {
+      if (&BB == NewReturnBB)
+        continue;
+
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        BranchInst *BI = BranchInst::Create(PullBB, RI);
+        BI->setDebugLoc(RI->getDebugLoc());
+        RI->eraseFromParent();
+      }
+    }
+  }
+
+  /// Call the transformation functions above to apply the transformation.
+  void eliminateRecursion() {
+    SmallVector<PHINode *, 8> ArgumentPHIs;
+    PHINode *ListHead;
+    PHINode *FreeList;
+    BasicBlock *LoopEntry;
+
+    std::tie(ListHead, FreeList, LoopEntry) = prepareEntry(ArgumentPHIs);
+
+    BasicBlock *CallerBB, *StaticInstBB;
+    std::tie(CallerBB, StaticInstBB) =
+        eliminateFirstCall(FirstCandidate.front(), LoopEntry, ArgumentPHIs);
+
+    eliminateOtherCalls(
+        OtherCandidates,
+        CallerBB,
+        ListHead,
+        FreeList);
+
+    BasicBlock *PullBB;
+    BasicBlock *HExecuteBB;
+    BasicBlock *NewReturnBB;
+    std::tie(PullBB, HExecuteBB, NewReturnBB) =
+        createNewReturnBlock(StaticInstBB, &OtherCandidates.front(), ListHead);
+
+    transformHExecution(
+        OtherCandidates,
+        HExecuteBB, ListHead, FreeList, LoopEntry, ArgumentPHIs);
+
+    redirectRemainingReturnBlocks(PullBB, NewReturnBB);
+
+    for (PHINode *PN : ArgumentPHIs) {
+      // If the PHI node is a dynamic constant, replace it with the value it
+      // always has.
+      if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+
+    if (Value *PNV = SimplifyInstruction(FreeList, F.getParent()->getDataLayout())) {
+      FreeList->replaceAllUsesWith(PNV);
+      FreeList->eraseFromParent();
+    }
+
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "RecursionEliminated",
+                                F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+             << "Stack modeled with width: " << ore::NV("Width", OtherCandidatesNumOf + 1);
+    });
+  }
+};
+
+/// Wrapper class for the legacy pass manager.
+struct RecursionStackElimination : public FunctionPass {
+  static char ID;
+  RecursionStackElimination() : FunctionPass(ID) {
+    initializeRecursionStackEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (DisableRecursionStackElimination)
+      return false;
+
+    if (skipFunction(F))
+      return false;
+
+    return RecursionStackEliminationImpl(
+        F,
+        getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()).run();
+  }
+};
+} // end anonymous namespace
+
+char RecursionStackElimination::ID = 0;
+INITIALIZE_PASS_BEGIN(RecursionStackElimination, DEBUG_TYPE, "Recursion Stack Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(RecursionStackElimination, DEBUG_TYPE, "Recursion Stack Elimination",
+                    false, false)
+
+/// Public interface to the RecursionStackElimination pass.
+FunctionPass *llvm::createRecursionStackEliminationPass() {
+  return new RecursionStackElimination();
+}
+
+/// Wrapper for the new pass manager.
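+///
+/// It mirrors the legacy wrapper: bail out while the transformation is
+/// disabled, otherwise run the shared implementation and report which
+/// analyses are preserved.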
+PreservedAnalyses RecursionStackEliminationPass::run(Function &F,
+                                                     FunctionAnalysisManager &AM) {
+  if (DisableRecursionStackElimination)
+    return PreservedAnalyses::all();
+
+  bool Changed = RecursionStackEliminationImpl(
+      F,
+      AM.getResult<AAManager>(F),
+      AM.getResult<OptimizationRemarkEmitterAnalysis>(F)).run();
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<GlobalsAA>();
+
+  return PA;
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -93,6 +93,7 @@
   initializeSimpleLoopUnswitchLegacyPassPass(Registry);
   initializeSinkingLegacyPassPass(Registry);
   initializeTailCallElimPass(Registry);
+  initializeRecursionStackEliminationPass(Registry);
   initializeSeparateConstOffsetFromGEPPass(Registry);
   initializeSpeculativeExecutionLegacyPassPass(Registry);
   initializeStraightLineStrengthReducePass(Registry);
@@ -236,6 +237,10 @@
   unwrap(PM)->add(createTailCallEliminationPass());
 }
 
+void LLVMAddRecursionStackEliminationinationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createRecursionStackEliminationPass());
+}
+
 void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createConstantPropagationPass());
 }