Index: include/llvm-c/Transforms/Scalar.h
===================================================================
--- include/llvm-c/Transforms/Scalar.h
+++ include/llvm-c/Transforms/Scalar.h
@@ -123,6 +123,9 @@
 /** See llvm::createTailCallEliminationPass function. */
 void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM);
 
+/** See llvm::createRecursionStackEliminationPass function. */
+void LLVMAddRecursionStackEliminationPass(LLVMPassManagerRef PM);
+
 /** See llvm::createConstantPropagationPass function. */
 void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM);
 
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -382,6 +382,7 @@
 void initializeStripSymbolsPass(PassRegistry&);
 void initializeStructurizeCFGPass(PassRegistry&);
 void initializeTailCallElimPass(PassRegistry&);
+void initializeRecursionStackEliminationPass(PassRegistry&);
 void initializeTailDuplicatePass(PassRegistry&);
 void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
 void initializeTargetPassConfigPass(PassRegistry&);
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -280,6 +280,14 @@
 
 //===----------------------------------------------------------------------===//
 //
+// RecursionStackElimination - This pass eliminates two or more call
+// instructions to the current function which occur immediately before return
+// instructions, by explicitly modelling a worklist on the stack.
+//
+FunctionPass *createRecursionStackEliminationPass();
+
+//===----------------------------------------------------------------------===//
+//
 // EarlyCSE - This pass performs a simple and fast CSE pass over the dominator
 // tree.
 //
Index: include/llvm/Transforms/Scalar/RecursionStackElimination.h
===================================================================
--- /dev/null
+++ include/llvm/Transforms/Scalar/RecursionStackElimination.h
@@ -0,0 +1,27 @@
+//===- RecursionStackElimination.h - Eliminate recursions ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface for the Recursion Stack Elimination pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H
+#define LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct RecursionStackEliminationPass
+    : PassInfoMixin<RecursionStackEliminationPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -137,6 +137,7 @@
 #include "llvm/Transforms/Scalar/NewGVN.h"
 #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
 #include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/Transforms/Scalar/RecursionStackElimination.h"
 #include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
 #include "llvm/Transforms/Scalar/SCCP.h"
 #include "llvm/Transforms/Scalar/SROA.h"
@@ -358,6 +359,12 @@
   assert(Level != O0 && "Must request optimizations!");
   FunctionPassManager FPM(DebugLogging);
 
+  FPM.addPass(SROA());                          // Try to get rid of allocas
+  FPM.addPass(SimplifyCFGPass());               // Merge & remove BBs
+  FPM.addPass(RecursionStackEliminationPass());
+  FPM.addPass(ADCEPass());                      // Delete dead instructions
+  FPM.addPass(SimplifyCFGPass());               // Merge & remove BBs
+
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROA());
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -546,6 +546,12 @@
   MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
 
   addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
+
+  MPM.add(createSROAPass());
+  MPM.add(createCFGSimplificationPass());       // Merge & remove BBs
+  MPM.add(createRecursionStackEliminationPass());
+  MPM.add(createAggressiveDCEPass());           // Delete dead instructions
+
   addFunctionSimplificationPasses(MPM);
 
   // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -55,6 +55,7 @@
   Reassociate.cpp
+  RecursionStackElimination.cpp
   Reg2Mem.cpp
   RewriteStatepointsForGC.cpp
   SCCP.cpp
   SROA.cpp
   Scalar.cpp
Index: lib/Transforms/Scalar/RecursionStackElimination.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/Scalar/RecursionStackElimination.cpp
@@ -0,0 +1,2158 @@
+//===- RecursionStackElimination.cpp - Eliminate multiple tail recursions -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts multiple tail recursive calls into a loop, by modeling
+// the calls as a singly linked worklist explicitly on the stack.
+//
+// void f(a, b, const c) {
+//   ...
+//   if (...)
+//     return
+//
+//   a1, b1, b2, b3, x2, x3 = ...
+//
+//   f(a1, b1, c) // 1st recursion
+//   a2 = h(x2)
+//   f(a2, b2, c) // 2nd recursion
+//   a3 = h(x3)
+//   f(a3, b3, c) // 3rd recursion
+// }
+//
+// ->
+//
+// void f(a, b, const c) {
+//   struct { x, b, next } *worklist = null
+//
+// loop:
+//   ...
+//   if (...)
+//     goto check_return
+//
+//   a1, b1, b2, b3, x2, x3 = ...
+//
+//   /* Assign arguments for the first call */
+//
+//   a = a1
+//   b = b1
+//
+//   /* Put the arguments of the remaining calls into the worklist */
+//
+//   queue = alloca(2 * sizeof(*worklist))
+//   queue[0] = {x2, b2, &queue[1]}
+//   queue[1] = {x3, b3, worklist}
+//   worklist = queue
+//
+//   goto loop
+//
+// check_return:
+//   if (!worklist)
+//     return
+//   a = h(worklist->x)
+//   b = worklist->b
+//   worklist = worklist->next
+//
+//   goto loop
+// }
+//
+// Such patterns occur, for example, when an application traverses k-ary full
+// trees.
+//
+// The benefit of this transformation is that neither the frame nor the link
+// address has to be stored on the stack. Also, pass-through arguments, like
+// 'const c' in the example above, are less likely to be saved onto the stack,
+// and if so, less often (only once in the entry block).
+//
+// The downsides are:
+//
+// a) An additional store for the worklist is required.
+//
+// b) The worst case stack memory required after this transformation depends
+//    on the number of recursive calls, instead of the recursion depth.
+//
+// c) Additional conditional branches.
+//
+// ad a) This point is compensated by avoiding storing the link address in each
+// call.
+//
+// ad b) This pass can additionally add (and does so by default) code to manage
+// unused blocks of allocas in a freelist. Because allocations are done in
+// blocks, the freelist management can also be done in blocks. This requires
+// the last item in each block to be marked, detectable by the code within the
+// return path. A further variable containing the current freelist is stored on
+// the stack; on the other hand, by not executing a recursion the frame pointer
+// loads and stores are omitted. Currently the following mark algorithms are
+// implemented:
+//
+// * TrueMarker: This algorithm marks each element and is applicable to binary
+//   recursions.
+//
+// * FalseMarker: This algorithm marks no element and is applicable when
+//   freelist management is disabled.
+//
+// * FieldMarker: A separate bit is allocated in the worklist item. This marker
+//   requires additional instructions but can be used in the general case.
+//
+// * CompareMarker: Assumes allocas return memory addresses in a strictly
+//   monotonic order. The freelist can be modeled to return the same order
+//   when pulling elements from it. Comparing each worklist item with
+//   worklist.next can then reveal whether the element is marked.
+//
+// * TaggedMarker: (not yet implemented) Similar to the FieldMarker, this
+//   marker marks the last item by a bit. But instead of using a separate bit
+//   it uses bit 0 of the worklist field. If the alignment of the worklist is
+//   a power of 2, and if it is >= 2, this marker can also cover the general
+//   case. It requires some additional bit masking but no additional memory
+//   operations.
+//
+// ad c) The pass adds one conditional branch into the return path and two
+// additional branches for freelist management (see (b) above). Depending on
+// the target, machine branch prediction can alleviate this.
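+//
+// For illustration, the worklist item type created by createListType below
+// roughly corresponds to the following C struct (a sketch only; the exact
+// operand fields depend on the stacked arguments, and the mark field is only
+// present with the FieldMarker):
+//
+//   struct rse_listty {
+//     struct { ... /* one field per stacked operand */ } operands;
+//     struct {
+//       struct rse_listty *next;
+//       /* bool mark; (FieldMarker only) */
+//     } link;
+//   };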
+//
+// Algorithm outline:
+// ------------------
+//
+// Analysis Phase:
+//
+// 1) Analyze the function and gather the returning basic blocks (BB) (and BBs
+//    branching to a return-only BB) with recursive calls (RC).
+//
+// 2) If more than one such BB or no RC is found, abandon the transformation.
+//
+// 3) If the remaining BB has only one RC, abandon the transformation.
+//    Otherwise let N be the number of RCs in this BB.
+//
+// 4) Analyze the instructions in this BB from the first RC until the
+//    terminator instruction and classify each instruction as movable or
+//    static.
+//
+//    A movable instruction is an instruction that can be safely moved before
+//    the first RC. All other instructions are classified as static.
+//
+// 5) Assign each static instruction to the following RC instruction. If static
+//    instructions are left after the last RC, abandon the transformation.
+//
+// 6) Build the function H with all its arguments for each RC. By including the
+//    call itself in function H and by enforcing this function to return void,
+//    it is ensured that there are no escaping values used after the recursion.
+//
+// 6.1) Note: By the way step 4 is executed it is guaranteed that function H
+//    for the first RC consists of a single instruction: the call itself. The
+//    first call candidate is handled specially (the same way as in Tail
+//    Recursion Elimination (TRE)).
+//
+// 5,6) Note: The information collected on each RC is stored in the structure
+//    RecursiveCall.
+//
+// 7) Compare the second RC's function H with all later ones. The behavior must
+//    match, otherwise abandon the transformation.
+//
+// 7.1) Note: As the first RC's function H is basically a TRE, it can be
+//    ignored in this step.
+//
+// 8) Simplify the argument list by removing constants and pass-through
+//    arguments.
+//
+// 9) Decide whether it is profitable to apply the transformation.
+//
+// Transformation Phase:
+//
+// 1) Adjust the entry block and split off the loop entry.
+//
+// 2) Eliminate the first RC (similar to TRE).
+//
+// 3) Eliminate the remaining RCs by allocating a new block (or picking it from
+//    the freelist) of N-1 struct items and filling it. This array is put at
+//    the front of the list. Pulling from the list is added in (4). The
+//    execution of function H is ensured in (5).
+//
+// 4) Create a new return block which pulls items from the worklist. If an
+//    end-of-block marker is reached, the block is put into the freelist.
+//
+// 5) Add the instructions from function H into the return block and create
+//    the loop.
+//
+// 6) Redirect each returning block into the new return block created in (4).
+//
+// 7) Drop the constant STACKDIRECTION. It is mainly used as a proof of
+//    concept for AArch64.
+//
+// Open issues and known TODOs - It would be great if reviewers could comment
+// on those as well:
+//
+// 1) Pipeline integration: Currently the pass is put before TRE. This includes
+//    some supporting passes, for cleanup and preparation. This cannot be left
+//    as is. The preferred way could be to adjust
+//    "AArch64TargetMachine::adjustPassManager" and only use the pass with O3.
+//    AArch64 is selected because this is the architecture the author has
+//    suitable benchmarking and test setups available for. This was tried once
+//    (in a similar way as in AMDGPUTargetMachine::adjustPassManager); however,
+//    the result was that many LLVM projects did not compile anymore because of
+//    linker problems (the Passes library was missing). Do you have any advice
+//    here? A sketch follows below.
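+//
+//    A minimal sketch of the hook meant here, modeled on
+//    AMDGPUTargetMachine::adjustPassManager (an assumption for discussion,
+//    not part of this patch):
+//
+//      void AArch64TargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
+//        PMB.addExtension(PassManagerBuilder::EP_CGSCCOptimizerLate,
+//            [](const PassManagerBuilder &B, legacy::PassManagerBase &PM) {
+//              if (B.OptLevel >= 3)
+//                PM.add(createRecursionStackEliminationPass());
+//            });
+//      }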
+//
+// 2) The way to test whether it is profitable to use the pass needs
+//    adjustment. I think that functions that spill more registers have an
+//    increased chance to profit, while functions that spill less have a lower
+//    chance for profit.
+//
+// 3) Thinking of a configurable way (maybe a separate marker class) to adjust
+//    the way markers are implemented. E.g.: putting the marker bit into the
+//    pointer on AArch64 showed a significant performance boost (test
+//    implemented in C).
+//
+// 4) Is it safe to temporarily create a function and return no-changes after
+//    deleting it? If not, is there a better way than using the
+//    FunctionComparator?
+//
+// 5) GlobalsAA needs to be preserved. Not sure about this in this context.
+//    Loads and stores are added here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/RecursionStackElimination.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+
+#include <forward_list>
+#include <map>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "recstackelim"
+
+namespace {
+// TODO (correctness): fetch this constant from TargetInfo. However, this is
+// CodeGen information and not yet available here. How to solve?
+const TargetFrameLowering::StackDirection STACKDIRECTION =
+    TargetFrameLowering::StackGrowsDown;
+
+/// Option to disable the pass.
+static cl::opt<bool>
+DisableRecursionStackElimination("disable-" DEBUG_TYPE, cl::init(false),
+    cl::Hidden, cl::desc("Disable stack recursion elimination"));
+
+/// Option to disable freelist handling.
+static cl::opt<bool>
+DisableStackCleanup("disable-" DEBUG_TYPE "-stack-cleanup", cl::init(false),
+    cl::Hidden,
+    cl::desc("Disable stack recursion elimination freelist cleanup"));
+
+/// Local implementation class.
+struct RecursionStackEliminationImpl {
+  RecursionStackEliminationImpl(Function &F, AliasAnalysis &AA,
+                                OptimizationRemarkEmitter &ORE)
+      : DL(F.getParent()->getDataLayout()),
+        C(F.getParent()->getContext()),
+        ORE(ORE),
+        F(F),
+        AA(AA) {}
+
+  bool run() {
+    // Check if feasible and profitable,
+    if (analyzeFunction()) {
+      // then eliminate the recursion.
+      eliminateRecursion();
+      return true;
+    }
+
+    return false;
+  }
+
+private:
+  /// Helper class to collect information on a single recursive call site. This
+  /// class also stores the required properties of the function H (see the
+  /// introductory description of this pass).
+  struct RecursiveCall {
+  private:
+    /// Hide the copy constructor.
+    RecursiveCall(const RecursiveCall &O)
+        : Index(O.Index), F(O.F), H(O.H), CI(O.CI), Is() {
+      llvm_unreachable("RecursiveCall cannot be copied");
+    }
+
+  public:
+    RecursiveCall(CallInst *CI, size_t Index)
+        : Index(Index), F(*CI->getParent()->getParent()), H(nullptr), CI(CI),
+          Is() {}
+
+    ~RecursiveCall() {
+      // If function H was already created, delete it. It is only used
+      // temporarily.
+      if (H)
+        H->eraseFromParent();
+    }
+
+    /// Add an instruction to the list representing the function H.
+    void addInstruction(Instruction *I) {
+      Is.push_back(I);
+    }
+
+    /// Returns the recursive call instruction.
+    CallInst *getCallInst() const {
+      return CI;
+    }
+
+    /// Test if the value V is part of function H. Because H is not created
+    /// yet, these instructions are denoted "local".
+    bool isLocalValue(Value *V, unsigned *Idx = nullptr) {
+      auto LocalInstructionIt = std::find(Is.begin(), Is.end(), V);
+
+      if (LocalInstructionIt == Is.end())
+        return false;
+
+      if (Idx)
+        *Idx = LocalInstructionIt - Is.begin();
+
+      return true;
+    }
+
+    /// Get the list of operands that are required for calling the function H.
+    const std::vector<Use *> &getIncomingOperands() const {
+      return IncomingOperands;
+    }
+
+    /// If possible create the function H (including the final recursive call
+    /// instruction). If it succeeds buildFunctionH returns true and assigns it
+    /// to the member variable H. The created function is purely temporary. Its
+    /// lifetime is bound to this object. The created function always returns
+    /// void.
+    bool buildFunctionH() {
+      if (H)
+        H->eraseFromParent();
+      H = nullptr;
+
+      // Get the argument list for H.
+      SmallVector<Type *, 8> HArgs;
+      for (auto &U: IncomingOperands)
+        HArgs.push_back(U->get()->getType());
+
+      // Get the function type and create it.
+      FunctionType *HTy = FunctionType::get(Type::getVoidTy(F.getContext()),
+                                            makeArrayRef(HArgs), F.isVarArg());
+      H = Function::Create(HTy, GlobalValue::PrivateLinkage,
+                           F.getAddressSpace(),
+                           F.getName() + ".H" + Twine(Index), F.getParent());
+      BasicBlock *NewBB = BasicBlock::Create(H->getContext(), "entry", H);
+
+      // Value-Value map to match instruction operands between the source and
+      // the clone.
+      std::map<Value *, Value *> VVMap;
+
+      // Define a lambda used to clone an instruction. If leaveConstants is
+      // true, constant operands are kept as is; otherwise they are taken from
+      // the argument list if possible.
+      auto cloneAndAppendInst = [this, &VVMap, NewBB](Instruction *I,
+                                                      bool leaveConstants) {
+        // We do not allow any uses outside the newly created function.
+        // Otherwise the function would have to return some non-void value.
+        for (Use &U : I->uses()) {
+          if (!isLocalValue(U.getUser()) && U.getUser() != CI) {
+            H->eraseFromParent();
+            H = nullptr;
+            return false;
+          }
+        }
+
+        // Clone the instruction, assign it to the newly created function and
+        // adjust all operands.
+        Instruction *NewI = I->clone();
+        VVMap[I] = NewI;
+        NewBB->getInstList().push_back(NewI);
+        for (Use &U: NewI->operands()) {
+          // Keep constant operands as is.
+          if (leaveConstants && isa<Constant>(U.get()))
+            continue;
+
+          // Replace other operands by their counterpart in the new function.
+          if (isLocalValue(U.get())) {
+            auto VVMI = VVMap.find(U.get());
+            if (VVMI == VVMap.end()) {
+              H->eraseFromParent();
+              H = nullptr;
+              return false;
+            }
+
+            U.set(VVMI->second);
+          }
+          else {
+            auto IOI = std::find_if(IncomingOperands.begin(),
+                                    IncomingOperands.end(),
+                                    [&U](const Use *S) -> bool {
+              return S->get() == U.get();
+            });
+
+            // If the operand is not found, ignore it if and only if it is a
+            // constant.
+            if (IOI == IncomingOperands.end() && isa<Constant>(U.get()))
+              continue;
+            assert(IOI != IncomingOperands.end() &&
+                   "Operand not in argument list");
+            size_t ArgIdx = IOI - IncomingOperands.begin();
+            U.set(H->arg_begin() + ArgIdx);
+          }
+        }
+
+        return true;
+      };
+      // EOL - Lambda cloneAndAppendInst
+
+      // Clone all instructions into the new function,
+      for (auto &I: Is) {
+        if (!cloneAndAppendInst(I, true))
+          return false;
+      }
+      // but allow constant arguments for the recursive call inst.
+      if (!cloneAndAppendInst(CI, false))
+        return false;
+
+      // Finally, append the terminator instruction.
+      ReturnInst::Create(H->getContext(), NewBB);
+
+      return true;
+    }
+
+    /// Return function H and create it if necessary.
+    Function *getFunctionH() {
+      if (!H) {
+        buildFunctionH();
+      }
+      return H;
+    }
+
+    /// Remove duplicates within the operands while preserving the order of
+    /// each first occurrence. The operands are given as uses. For duplicates
+    /// an arbitrary use is selected to be kept.
+    void cleanupIncomingOperands() {
+      std::map<Value *, Use *> map;
+      std::vector<Use *> tmp(IncomingOperands.begin(), IncomingOperands.end());
+      IncomingOperands.clear();
+      for (auto &O: tmp) {
+        bool inserted;
+        std::tie(std::ignore, inserted) =
+            map.insert(std::make_pair(O->get(), O));
+        if (inserted)
+          IncomingOperands.push_back(O);
+      }
+    }
+
+    /// Collect all required operands for instruction I. If allowConstants is
+    /// set, constants are also put on the operand list.
+    void analyzeOperands(User::op_range Operands, Instruction *I,
+                         bool allowConstants) {
+      for (auto &U: Operands) {
+        if (!allowConstants && isa<Constant>(U.get()))
+          continue;
+
+        // If the operand is a non-local value add it to the list. This covers
+        // all movable instructions, as well as static instructions of other
+        // previous candidates. For movable instructions this is feasible
+        // because these instructions will remain (moved). On the other hand,
+        // static instructions are removed; referring to those is invalid. An
+        // example is depicted in the sequence below.
+        //
+        // ; Candidate 0
+        // call @recursive(%donotcare)
+        // ; ...
+        // ;
+        // ; Candidate n
+        // %a = load %something
+        // %b = load %somethingelse
+        // call @recursive(%a)
+        // ; ...
+        // ;
+        // ; Candidate m > n
+        // call @recursive(%b)
+        //
+        // This case is not detected within this call. However, because the
+        // function H of candidate n would have to return a non-void value, it
+        // is covered during buildFunctionH.
+        if (!isLocalValue(U.get())) {
+          IncomingOperands.push_back(&U);
+        }
+      }
+    }
+
+    /// Analyze whether this candidate can be feasible for this transformation.
+    bool analyze() {
+      IncomingOperands.clear();
+      for (auto &I: Is) {
+        // In general it is not safe to replace a constant by an arbitrary
+        // value in the LLVM IR. allowConstants is therefore set to false to
+        // avoid putting them onto the argument list.
+        analyzeOperands(I->operands(), I, false);
+      }
+      // As we know that all candidates are recursive, only arg_operands are
+      // analyzed for the recursive call instruction. Also, for CallInst
+      // arg_operands it is feasible and potentially profitable to replace
+      // constants with values. E.g.:
+      //
+      // call @recursive(%this, 0);
+      // call @recursive(%this, 1);
+      // ...
+      // call @recursive(%this, n);
+      // ret
+      analyzeOperands(CI->arg_operands(), CI, true);
+
+      // Remove duplicates from the operand list.
+      cleanupIncomingOperands();
+
+      // Try to encapsulate the call and all static instructions into a new
+      // separate function.
+      if (!buildFunctionH())
+        return false;
+
+      return true;
+    }
+
+    /// Test if O's function H behaves equally compared to the own function H.
+    bool compareFunctionH(RecursiveCall *O) const {
+      GlobalNumberState GN;
+      return FunctionComparator(H, O->H, &GN).compare() == 0;
+    }
+
+    /// Erase the CallInst from the source function.
+    void eraseSourceCallInst() {
+      CI->eraseFromParent();
+      CI = nullptr;
+    }
+
+    /// Erase all static instructions from the source function.
+    void eraseSourceInstructions() {
+      eraseSourceCallInst();
+      for (auto i = Is.rbegin(), e = Is.rend(); i != e; /* in loop */) {
+        (*(i++))->eraseFromParent();
+      }
+      Is.clear();
+    }
+
+    size_t getIndex() const {
+      return Index;
+    }
+
+  private:
+    /// Index, solely used for naming.
+    size_t Index;
+
+    /// Recursive function the candidate is called within.
+    Function &F;
+
+    /// Temporary function H (including the recursive call). This function is
+    /// used during analysis and transformation. It is never called and is
+    /// destroyed when this object is deleted.
+    Function *H;
+
+    /// Recursive call instruction.
+    CallInst *CI;
+
+    /// Instructions that precede the current call.
+    SmallVector<Instruction *, 8> Is;
+    std::vector<Use *> IncomingOperands;
+  };
+
+  /// Helper class to collect and classify used arguments and analyze arguments
+  /// used by function H.
+  struct IncomingArgument {
+    /// Stacked arguments are arguments that need to be modeled on the stack.
+    ///
+    /// Static arguments are arguments which are passed through to all
+    /// recursive calls without changing position nor value.
+    ///
+    /// e.g.:
+    /// fn(a, b, c) {
+    ///   fn(a, b, c);
+    ///   fn(a + 1, a, c);
+    /// }
+    /// => StaticArguments = {c}
+    ///
+    /// Constant arguments are constants that are equal for each call except
+    /// the first.
+    ///
+    /// UnknownKind are arguments which are not yet classified.
+    enum Kind {
+      StackedKind,
+      StaticKind,
+      ConstantKind,
+      UnknownKind,
+    };
+
+    IncomingArgument() : Uses(), K(UnknownKind) {}
+
+    /// Add an argument use and update the current classification.
+    void addUse(const RecursiveCall *CC, Use *U, bool CheckOnly = false) {
+      Value *V = U->get();
+      switch (K) {
+      case UnknownKind:
+        K = StackedKind;
+        if (isa<Constant>(V)) {
+          K = ConstantKind;
+        }
+        else if (Argument *A = dyn_cast<Argument>(V)) {
+          if (A == CC->getCallInst()->getArgOperand(A->getArgNo())) {
+            K = StaticKind;
+          }
+        }
+        Ty = V->getType();
+        break;
+
+      case StackedKind:
+        // Keep Stacked as Stacked.
+        break;
+
+      case ConstantKind:
+        // Change kind to stacked if the new argument is no constant or if the
+        // constant value differs.
+        if (Constant *C = dyn_cast<Constant>(V)) {
+          if (C != getValue())
+            K = StackedKind;
+        }
+        else {
+          K = StackedKind;
+        }
+        break;
+
+      case StaticKind:
+        // Change kind to stacked if the new argument is no argument or if the
+        // position in the corresponding CallInst is unsuitable.
+        if (Argument *A = dyn_cast<Argument>(V)) {
+          if (A != CC->getCallInst()->getArgOperand(A->getArgNo())) {
+            K = StackedKind;
+          }
+        }
+        else {
+          K = StackedKind;
+        }
+        break;
+      }
+      assert(Ty == V->getType() && "Argument type mismatch");
+      if (!CheckOnly)
+        Uses.insert(std::make_pair(CC, U));
+    }
+
+    /// Get the used value of a specific candidate, or the used value of any
+    /// candidate.
+    Value *getValue(const RecursiveCall *CC = nullptr) const {
+      if (CC)
+        return Uses.find(CC)->second->get();
+      else
+        return Uses.begin()->second->get();
+    }
+
+    /// Get the type of this argument.
+    Type *getType() const {
+      return Ty;
+    }
+
+    /// Returns true if this argument is of stacked kind.
+    bool isStacked() const {
+      assert(K != UnknownKind && "Argument kind is unknown");
+      return K == StackedKind;
+    }
+
+    /// Returns true if this argument is of static kind.
+    bool isStatic() const {
+      assert(K != UnknownKind && "Argument kind is unknown");
+      return K == StaticKind;
+    }
+
+    /// Returns true if this argument is of constant kind.
+    bool isConstant() const {
+      assert(K != UnknownKind && "Argument kind is unknown");
+      return K == ConstantKind;
+    }
+
+  private:
+    /// Candidate-Use map of the argument.
+    std::map<const RecursiveCall *, Use *> Uses;
+
+    /// Classification.
+    Kind K;
+
+    /// Argument type.
+    Type *Ty;
+  };
+
+  struct TrueMarker {
+    using ItemListContainer = SmallVectorImpl<Type *>;
+
+    TrueMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : C(C), OtherCandidatesNumOf(OtherCandidatesNumOf) {}
+    virtual ~TrueMarker() {}
+
+    virtual bool mayPullFromFreeList() {
+      // This is the default behavior.
+      return true;
+    }
+
+    virtual bool checkListType(Type *ListTy) {
+      return true;
+    }
+
+    virtual void adjustStructFields(ItemListContainer &ItemList) {}
+
+    virtual void addSetupMarkerInst(Value *LinkPtr, unsigned EntryIndex,
+                                    Instruction *InsertPos,
+                                    const DebugLoc &DebugLoc,
+                                    const DataLayout &DL) {
+    }
+
+    virtual Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                                      Value *NextPtr, Instruction *InsertPos,
+                                      const DebugLoc &DebugLoc) {
+      return ConstantInt::getTrue(IntegerType::get(C, 1));
+    }
+
+    virtual unsigned getItemIndex(unsigned Item) {
+      return Item;
+    }
+
+    LLVMContext &C;
+    const unsigned OtherCandidatesNumOf;
+  };
+
+  struct FalseMarker : public TrueMarker {
+    using base = TrueMarker;
+
+    FalseMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : base(C, OtherCandidatesNumOf) {}
+
+    bool mayPullFromFreeList() override {
+      return false;
+    }
+
+    Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                              Value *NextPtr, Instruction *InsertPos,
+                              const DebugLoc &DebugLoc) override {
+      return ConstantInt::getFalse(IntegerType::get(C, 1));
+    }
+  };
+
+  struct FieldMarker : public TrueMarker {
+    using base = TrueMarker;
+
+    FieldMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : base(C, OtherCandidatesNumOf), Index(-1) {}
+
+    void adjustStructFields(ItemListContainer &ItemList) override {
+      Index = ItemList.size();
+      ItemList.push_back(IntegerType::get(C, 1));
+    }
+
+    void addSetupMarkerInst(Value *LinkPtr, unsigned EntryIndex,
+                            Instruction *InsertPos, const DebugLoc &DebugLoc,
+                            const DataLayout &DL) override {
+      Value *Idx[2] = {
+          ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), 0),
+          ConstantInt::get(IntegerType::get(C, 32), Index) };
+
+      Type *LinkTy = cast<PointerType>(LinkPtr->getType())->getElementType();
+      Instruction *MarkPtrGEP = GetElementPtrInst::CreateInBounds(
+          LinkTy, LinkPtr, makeArrayRef(Idx, 2),
+          "rse.markptr." + Twine(EntryIndex), InsertPos);
+      MarkPtrGEP->setDebugLoc(DebugLoc);
+
+      Value *Mark;
+      if (EntryIndex < OtherCandidatesNumOf - 1) {
+        Mark = ConstantInt::getFalse(IntegerType::get(C, 1));
+      }
+      else {
+        Mark = ConstantInt::getTrue(IntegerType::get(C, 1));
+      }
+
+      StoreInst *SMI = new StoreInst(Mark, MarkPtrGEP, false,
+                                     DL.getABITypeAlignment(Mark->getType()),
+                                     InsertPos);
+      SMI->setDebugLoc(DebugLoc);
+    }
+
+    Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                              Value *NextPtr, Instruction *InsertPos,
+                              const DebugLoc &DebugLoc) override {
+      assert(Index != (unsigned)-1 && "Marker field index unknown");
+
+      Instruction *Mark = ExtractValueInst::Create(
+          ListItem, makeArrayRef(&Index, 1), "rse.mark", InsertPos);
+      Mark->setDebugLoc(DebugLoc);
+      return Mark;
+    }
+
+    unsigned Index;
+  };
+
+  struct CompareMarker : public TrueMarker {
+    using base = TrueMarker;
+
+    CompareMarker(LLVMContext &C, unsigned OtherCandidatesNumOf)
+        : base(C, OtherCandidatesNumOf) {}
+
+    unsigned getItemIndex(unsigned Item) override {
+      if (STACKDIRECTION == TargetFrameLowering::StackGrowsDown)
+        return OtherCandidatesNumOf - 1 - Item;
+      else
+        return Item;
+    }
+
+    Value *addComputeMarkInst(Value *ListItem, Value *ListHead,
+                              Value *NextPtr, Instruction *InsertPos,
+                              const DebugLoc &DebugLoc) override {
+      Instruction *Mark;
+      if (STACKDIRECTION == TargetFrameLowering::StackGrowsDown)
+        // TODO (correctness): What if NextPtr == nullptr? This is currently a
+        // false negative which unnecessarily spills the stack. This case has
+        // to be handled efficiently (e.g.: use (unsigned)-1 instead of
+        // nullptr, iff this is valid on the target machine).
+        Mark = new ICmpInst(InsertPos, CmpInst::ICMP_ULT, ListHead, NextPtr,
+                            "rse.mark");
+      else
+        Mark = new ICmpInst(InsertPos, CmpInst::ICMP_UGT, ListHead, NextPtr,
+                            "rse.mark");
+      Mark->setDebugLoc(DebugLoc);
+      return Mark;
+    }
+  };
+
+  typedef std::forward_list<RecursiveCall> CandidateContainer;
+
+  /// Current DataLayout.
+  const DataLayout &DL;
+
+  /// Current Context.
+  LLVMContext &C;
+
+  /// Optimization Remark Emitter to use.
+  OptimizationRemarkEmitter &ORE;
+
+  /// Function the pass is working on.
+  Function &F;
+
+  /// Alias Analysis of the current function.
+  AliasAnalysis &AA;
+
+  /// Type of list elements for the selected marker algorithm.
+  StructType *ListTy;
+
+  /// List of call candidates to be eliminated; this list contains the single
+  /// first candidate.
+  CandidateContainer FirstCandidate;
+
+  /// List of the remaining call candidates to be eliminated.
+  CandidateContainer OtherCandidates;
+
+  /// Number of elements in the OtherCandidates list.
+  unsigned OtherCandidatesNumOf;
+
+  /// List of arguments for the function H.
+  SmallVector<IncomingArgument, 4> IncomingArguments;
+
+  /// List of movable instructions.
+  SmallVector<Instruction *, 8> MovableInst;
+
+  /// List of static instructions.
+  SmallVector<Instruction *, 8> StaticInst;
+
+  /// Selected marker algorithm.
+  std::unique_ptr<TrueMarker> Marker;
+
+  /// Find all recursive calls within the current basic block and add them to
+  /// the list of candidates.
+  ///
+  /// \returns true if at least one recursive call is found.
+  bool appendAllRecursiveCallsInBB(Function &F, BasicBlock &BB,
+                                   unsigned &NumOf,
+                                   CandidateContainer &Candidates,
+                                   CandidateContainer::iterator &LastCandidate) {
+    bool found = false;
+    for (auto &I: BB) {
+      CallInst *CI = dyn_cast<CallInst>(&I);
+      if (CI && CI->getCalledFunction() == &F) {
+        LastCandidate = Candidates.emplace_after(LastCandidate, CI, NumOf);
+        NumOf++;
+        found = true;
+      }
+    }
+    return found;
+  }
+
+  /// Test if the given basic block is a return-only BB.
+  bool isReturnOnlyBB(const BasicBlock &BB) const {
+    const ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+    return (RI != nullptr && BB.getFirstNonPHIOrDbg() == RI);
+  }
+
+  /// Return true if it is safe to move the specified instruction from after
+  /// the call to before the call, assuming that all instructions between the
+  /// call and this instruction are movable.
+  ///
+  /// This is a copy of canMoveAboveCall within TailRecursionElimination.
+  ///
+  /// \returns true if the instruction can be moved.
+  bool canMoveAbove(Instruction *I, Instruction *SI) {
+    // FIXME: We can move load/store/call/free instructions above the call if
+    // the call does not mod/ref the memory location being processed.
+    if (I->mayHaveSideEffects()) // This also handles volatile loads.
+      return false;
+
+    if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      // Loads may always be moved above calls without side effects.
+      if (SI->mayHaveSideEffects()) {
+        const DataLayout &DL = L->getModule()->getDataLayout();
+        if (isModSet(AA.getModRefInfo(SI, MemoryLocation::get(L))) ||
+            !isSafeToLoadUnconditionally(L->getPointerOperand(),
+                                         L->getAlignment(), DL, L))
+          return false;
+      }
+    }
+
+    // Otherwise, if this is a side-effect free instruction, check to make sure
+    // that it does not use the return value of the call. If it doesn't use the
+    // return value of the call, it must only use things that are defined
+    // before the call, or movable instructions between the call and the
+    // instruction itself.
+    return !is_contained(I->operands(), SI);
+  }
+
+  /// Analyze the current function and return true if it is feasible and
+  /// profitable to apply the transformation.
+  ///
+  /// \returns true if the transformation should be applied.
+  bool analyzeFunction() {
+    // Number of basic blocks containing all recursive calls.
+    int BBwithCandidates = 0;
+    bool hasAllocas = false;
+
+    CandidateContainer Candidates;
+    auto LastCandidate = Candidates.before_begin();
+
+    // We cannot handle variadic arguments.
+    if (F.isVarArg())
+      return false;
+
+    // TODO (improvement): There might be ways to implement non-void returns,
+    // for example similar to accumulator TREs.
+    if (F.getReturnType() != Type::getVoidTy(F.getContext()))
+      return false;
+
+    // No, can't be done.
+    if (F.callsFunctionThatReturnsTwice())
+      return false;
+
+    unsigned CandidatesNumOf = 0;
+    for (auto &BB: F) {
+      // EH pads cannot be handled at all.
+      if (BB.isEHPad()) {
+        return false;
+      }
+
+      // Alloca instructions are not allowed.
+      //
+      // TODO (improvement): Allocas can theoretically be handled if it can be
+      // proven that the used memory's lifetime is bounded before the first
+      // call. In this case the alloca can be moved into the new entry block
+      // the same way tail recursion elimination does.
+      for (auto &I: BB) {
+        if (isa<AllocaInst>(&I)) {
+          hasAllocas = true;
+        }
+      }
+
+      if (isa<ReturnInst>(BB.getTerminator())) {
+        // For returning blocks find all candidates.
+        if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates,
+                                        LastCandidate)) {
+          BBwithCandidates++;
+        }
+      }
+      else if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+        // For non-returning blocks find all candidates, iff the block has a
+        // single successor which is a return-only block.
+        if (BI->isUnconditional() && isReturnOnlyBB(*BI->getSuccessor(0))) {
+          if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates,
+                                          LastCandidate)) {
+            BBwithCandidates++;
+          }
+        }
+      }
+
+      // Currently only one BB with tail recursive calls is allowed.
+      //
+      // TODO (improvement): There might be interesting cases with recursive
+      // calls in more than one BB.
+      if (BBwithCandidates > 1) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                     "Recursion elimination missed",
+                     F.getEntryBlock().front().getDebugLoc(),
+                     &F.getEntryBlock())
+                 << "Multiple Basic Blocks";
+        });
+        return false;
+      }
+    }
+
+    // No RSE required. Leave it to TRE.
+    if (CandidatesNumOf < 2) {
+      return false;
+    }
+
+    if (hasAllocas) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE,
+                   "Recursion elimination missed",
+                   F.getEntryBlock().front().getDebugLoc(),
+                   &F.getEntryBlock())
+               << "Functions containing alloca instructions not yet supported";
+      });
+      return false;
+    }
+
+    BasicBlock *BB = Candidates.front().getCallInst()->getParent();
+    Instruction *TI = BB->getTerminator();
+
+    // All instructions between the first candidate and the terminator are
+    // partitioned into static (unmovable) instructions and movable
+    // instructions. The resulting two arrays are ordered in the required
+    // execution sequence.
+    //
+    // This means that the execution order (MovableInst | StaticInst) is legal
+    // and equivalent to the original sequence.
+    //
+    // Algorithm:
+    //
+    // * Iterate over all instructions between Candidate[n] and Candidate[n+1].
+    //
+    // * Test if the instruction can be moved above all identified static
+    //   instructions.
+    //
+    // * If it can, add it to the movable instruction list, otherwise add it
+    //   to the static instruction list.
+    //
+    // * Consecutively add static instructions if necessary.
+
+    // Iterate over all instructions in the interval
+    // (Candidate[x], Candidate[x+1]). Virtually include the terminator as an
+    // additional candidate.
+    //
+    // Set up the first iteration interval between Candidate[0] and
+    // Candidate[1]. Note: The list has at least two members.
+    auto Cand0 = Candidates.begin();
+    auto Cand1 = next(Cand0);
+    BasicBlock::iterator BBI(Cand0->getCallInst());
+    auto CandI = Cand1;
+
+    // The first candidate is always a single call which is a static
+    // instruction.
+    StaticInst.push_back(&*BBI);
+
+    while (true) {
+      // Identify the instruction that marks the end of the current pair.
+      //
+      // It is either CandI, or the terminator instruction (which is virtually
+      // added).
+      Instruction *E;
+      if (CandI != Candidates.end()) {
+        E = CandI->getCallInst();
+      }
+      else {
+        E = TI;
+      }
+
+      // Advance the iterator first, because we start with the first call
+      // instruction which is already added.
+      ++BBI;
+
+      // If we reached the terminator we are done.
+      if (&*BBI == TI)
+        break;
+
+      // The current search interval's end candidate is reached?
+      if (&*BBI == E) {
+        StaticInst.push_back(&*BBI);
+        // Advance the current iteration interval to the next candidate.
+        ++CandI;
+        continue;
+      }
+
+      // Test if the instruction can be moved and add it accordingly.
+      bool movable = true;
+      for (auto &SI: StaticInst) {
+        if (canMoveAbove(&*BBI, &*SI))
+          continue;
+        movable = false;
+        break;
+      }
+
+      if (movable)
+        MovableInst.push_back(&*BBI);
+      else
+        StaticInst.push_back(&*BBI);
+    }
+
+    // Assign all static instructions to the candidates. We do it naively by
+    // just assigning the instructions between two candidates to the second
+    // one.
+    auto SI = StaticInst.begin();
+    for (auto &Cand: Candidates) {
+      while (*SI != Cand.getCallInst()) {
+        Cand.addInstruction(*SI);
+        SI++;
+      }
+      // Skip the candidate's call inst.
+      SI++;
+    }
+
+    // If not all static instructions have been assigned, then there are
+    // unmovable trailing instructions and the transformation is abandoned.
+    if (SI != StaticInst.end()) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE,
+                   "Recursion elimination missed", *SI)
+               << "Instructions after last call.";
+      });
+      return false;
+    }
+
+    // Analyze for each call candidate whether the call and the corresponding
+    // static instructions are suitable to be eliminated.
+    for (auto &Cand: Candidates) {
+      if (!Cand.analyze()) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                     "Recursion elimination missed", Cand.getCallInst())
+                 << "RecursiveCall nonviable";
+        });
+        return false;
+      }
+    }
+
+    // Test if the functions generated during analysis (step before) behave
+    // equivalently.
+    for (auto CCI = std::next(Cand1), CCE = Candidates.end();
+         CCI != CCE; ++CCI) {
+
+      // Test if the function H(...) of both candidates match.
+      if (!CCI->compareFunctionH(&*Cand1)) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE,
+                     "Recursion elimination missed", CCI->getCallInst())
+                 << "RecursiveCall nonviable (Functions H do not match)";
+        });
+        return false;
+      }
+    }
+
+    // Analyze the operands of each function H and categorize them into
+    // Stacked, Constant and Static.
+    IncomingArguments.resize(Cand1->getIncomingOperands().size(),
+                             IncomingArgument());
+    for (auto CCI = Cand1, CCE = Candidates.end(); CCI != CCE; ++CCI) {
+      assert(IncomingArguments.size() == CCI->getIncomingOperands().size() &&
+             "Operand size mismatch");
+
+      auto IAI = IncomingArguments.begin();
+      for (Use *U: CCI->getIncomingOperands()) {
+        IAI->addUse(&*CCI, U);
+
+        // If the value is an Argument we need to check the position in the
+        // first candidate too. It is sufficient here to just execute the
+        // checks.
+        if (isa<Argument>(U->get())) {
+          IAI->addUse(&*Cand0, U, true);
+        }
+        ++IAI;
+      }
+    }
+
+    // Count the categories.
+    int NumStaticArguments = 0;
+    int NumConstantArguments = 0;
+    for (auto &IA: IncomingArguments) {
+      if (IA.isConstant())
+        NumConstantArguments++;
+      if (IA.isStatic())
+        NumStaticArguments++;
+    }
+
+    // Estimate costs and gain.
+    if (IncomingArguments.size() - NumStaticArguments - NumConstantArguments
+            + 2 > F.arg_size()) {
+      LLVM_DEBUG(dbgs() << "Will not apply - too costly\n");
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE,
+                   "Recursion elimination missed", Cand0->getCallInst())
+               << "Too costly";
+      });
+      return false;
+    }
+
+    OtherCandidatesNumOf = CandidatesNumOf - 1;
+
+    if (DisableStackCleanup) {
+      Marker.reset(new FalseMarker(C, OtherCandidatesNumOf));
+    }
+    else {
+      if (OtherCandidatesNumOf == 1) {
+        Marker.reset(new TrueMarker(C, OtherCandidatesNumOf));
+      }
+      else {
+        Marker.reset(new CompareMarker(C, OtherCandidatesNumOf));
+        //Marker.reset(new FieldMarker(C, OtherCandidatesNumOf));
+      }
+    }
+
+    ListTy = createListType();
+    if (!Marker->checkListType(ListTy)) {
+      return false;
+    }
+
+    // Splice the list into the two output lists.
+    // One contains only the first element, the other the remaining elements.
+    OtherCandidates.splice_after(OtherCandidates.before_begin(), Candidates,
+                                 Candidates.begin(), Candidates.end());
+    FirstCandidate.splice_after(FirstCandidate.before_begin(), Candidates);
+
+    return true;
+  }
+
+  /// Creates the worklist structure used to model the pending call list.
+  StructType *createListType() const {
+    // Collect the types of the operands.
+    SmallVector<Type *, 8> ListElements;
+    for (auto &IA: IncomingArguments) {
+      if (IA.isStacked())
+        ListElements.push_back(IA.getType());
+    }
+
+    // Create a new StructType representing the list.
+    StructType *ListTy = StructType::create(C, "rse.listty");
+    StructType *OperandsTy = StructType::get(C, makeArrayRef(ListElements),
+                                             false);
+
+    SmallVector<Type *, 2> LinkItems;
+    LinkItems.push_back(ListTy->getPointerTo());
+    Marker->adjustStructFields(LinkItems);
+    StructType *LinkTy = StructType::get(C, makeArrayRef(LinkItems), false);
+
+    Type *ListItemTys[2] = { OperandsTy, LinkTy };
+    ListTy->setBody(makeArrayRef(ListItemTys, 2), false);
+
+    return ListTy;
+  }
+
+  /// Prepare the entry block by splitting off the loop entry. The following
+  /// PHINodes are added to the loop entry:
+  ///
+  /// * One for each argument (the uses are adjusted as well).
+  /// * The list head representing the root of the worklist.
+  /// * If enabled, the root of the free list.
+  template <typename ArgumentPHIContainer>
+  std::tuple<PHINode *, PHINode *, BasicBlock *>
+  prepareEntry(ArgumentPHIContainer &ArgumentPHIs) {
+    const DebugLoc &DebugLoc =
+        FirstCandidate.front().getCallInst()->getDebugLoc();
+
+    // Split the entry block at the first viable instruction.
+    BasicBlock *Entry = &F.getEntryBlock();
+    BasicBlock *LoopEntry = Entry->splitBasicBlock(Entry->getFirstNonPHI(),
+                                                   "rse.loop");
+    Entry->getTerminator()->setDebugLoc(DebugLoc);
+
+    // Set up the list PHI node for the loop entry.
+    Instruction *InsertPos = &LoopEntry->front();
+    PHINode *ListHead = PHINode::Create(ListTy->getPointerTo(), 2, "rse.list",
+                                        InsertPos);
+    ListHead->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()),
+                          Entry);
+
+    // Set up the FreeList if necessary.
+    PHINode *FreeList = nullptr;
+    FreeList = PHINode::Create(ListTy->getPointerTo(), 2, "rse.freelist",
+                               InsertPos);
+    FreeList->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()),
+                          Entry);
+
+    // Set up argument PHI nodes for the loop entry - see TRE.
+    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(), 2,
+                                    "rse." + I->getName(), InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone uses the PHI node now!
+      PN->addIncoming(&*I, Entry);
+      ArgumentPHIs.push_back(PN);
+    }
+
+    return std::make_tuple(ListHead, FreeList, LoopEntry);
+  }
+
+  /// Adjust all ArgumentPHIs according to the CallInst to be eliminated.
+  ///
+  /// Arguments are taken from CI. IncomingBB is used as the BB to add to the
+  /// PHI list.
+  template <typename ArgumentPHIContainer>
+  void adjustPHIsFromCallInst(
+      BasicBlock *IncomingBB, CallInst *CI,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+    // Adjust PHI nodes in the loop entry.
+    CallInst::op_iterator Arg = CI->arg_begin();
+    for (auto &PHI: ArgumentPHIs) {
+      assert(Arg != CI->arg_end() &&
+             "Callsite does not provide enough arguments.");
+      PHI->addIncoming(Arg->get(), IncomingBB);
+      Arg++;
+    }
+    assert(Arg == CI->arg_end() && "Callsite provided too many arguments.");
+  }
+
+  /// Eliminates the first call just like it is done within tail call
+  /// elimination.
+  /// Additionally it moves all movable instructions before the first call
+  /// candidate, and it splits the remaining calls into a separate basic
+  /// block. ArgumentPHIs are adjusted accordingly.
+  //
+  // calling.bb:
+  //   ...
+  //   call @recursive ...
+  //
+  //   %movable1 = add ...
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %movable2 = add ...
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  //
+  // ->
+  //
+  // calling.bb:
+  //   ...
+  //   %movable1 = add ...
+  //   %movable2 = add ...
+  //   br %loopentry
+  //
+  // H1:
+  //
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  template <typename ArgumentPHIContainer>
+  std::tuple<BasicBlock *, BasicBlock *>
+  eliminateFirstCall(RecursiveCall &CC, BasicBlock *LoopEntry,
+                     const ArgumentPHIContainer &ArgumentPHIs) {
+    CallInst *CI = CC.getCallInst();
+
+    // Move all movable instructions before this call instruction.
+    for (auto &I: MovableInst)
+      I->moveBefore(CI);
+
+    // Split off the remaining static (unmovable) instructions.
+    BasicBlock *CallerBB = CI->getParent();
+    BasicBlock *StaticInstBB = CallerBB->splitBasicBlock(CI, "rse.H1");
+    BranchInst *BI = cast<BranchInst>(CallerBB->getTerminator());
+
+    // Create the loop to eliminate the call to the first candidate.
+    BI->setSuccessor(0, LoopEntry);
+    BI->setDebugLoc(CI->getDebugLoc());
+
+    adjustPHIsFromCallInst(CallerBB, CI, ArgumentPHIs);
+    CC.eraseSourceCallInst();
+
+    return std::make_tuple(CallerBB, StaticInstBB);
+  }
+
+  /// Adds code to bypass the alloca (inserted by eliminateOtherCalls) with
+  /// code picking the block from the freelist.
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //
+  //   ; allocate memory on stack
+  //   %itemptr = alloca %listty, 2
+  //
+  //   ; compute addresses of list items
+  //   %gep.0 = getelementptr %listty, 0
+  //   ...
+  //   %gep.k = getelementptr %listty, k
+  //
+  //   ; set up and store list item [0]; set mark to false
+  //   mark.0 =
+  //   %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0
+  //   store %gep.1, *nextptr.0
+  //   ...
+  //   mark.k =
+  //   %nextptr.k = getelementptr %listty, itemptr, k, 1, 0
+  //   store %listhead, %nextptr.k
+  //
+  //   ; set up and store operands entries
+  //   %operands.0 = getelementptr %listty, itemptr, 0, 0
+  //   %arg.0-0 = insertvalue mark_0, undef, %movable.0-0
+  //   store %arg.0-0, %operands.0
+  //   ...
+  //
+  // -> [AI = %itemptr, AllocaEnd = %nextptr.k, NewHeadPtr = %gep.0]
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //
+  // alloca:
+  //   ; allocate memory on stack
+  //   %itemptr = alloca %listty, 2
+  //
+  //   ; compute addresses of list items
+  //   %gep.0 = getelementptr %listty, 0
+  //   ...
+  //   %gep.k = getelementptr %listty, k
+  //
+  //   ; set up and store list item [0]; set mark to false
+  //   mark.0 =
+  //   %nextptr.0 = getelementptr %listty, %itemptr, 0, 1, 0
+  //   store %gep.1, *nextptr.0
+  //   ...
+  //   mark.k =
+  //
+  // allocfromfree:
+  //   %gep.0.fromfree = getelementptr %listty, %freelist, ...
+  //   %nextptr = getelementptr %listty, 0, #STRUCT_IDX_OFNEXT
+  //   %next = load %nextptr
+  //   br calling.bb.1
+  //
+  // calling.bb.1:
+  //   %newheadptrphi = phi %listty, [%gep.0, %alloca], [%gep.0.fromfree, %allocfromfree]
+  //   %allocated = phi %listty, [%itemptr, %alloca], [%freelist, %allocfromfree]
+  //   %newfreelist = phi %listty, [%freelist, %alloca], [%next, %allocfromfree]
+  //
+  //   %nextptr.k = getelementptr %listty, %allocated, k, 1, 0
+  //   store %listhead, %nextptr.k
+  //
+  //   ; set up and store operands entries
+  //   %operands.0 = getelementptr %listty, %allocated, 0, 0
+  //   %arg.0-0 = insertvalue mark_0, undef, %movable.0-0
+  //   store %arg.0-0, %operands.0
+  //   ...
+  Instruction *replaceAllocaWithFreeList(AllocaInst *AI,
+                                         Instruction *AllocaEnd,
+                                         Instruction *NewHeadPtr,
+                                         PHINode *FreeList) {
+    // Never generate code to pull from the free list if not necessary.
+    if (!Marker->mayPullFromFreeList()) {
+      FreeList->addIncoming(FreeList, AI->getParent());
+      return NewHeadPtr;
+    }
+
+    const DebugLoc &DebugLoc = AI->getDebugLoc();
+    BasicBlock *PreCallingBB = AI->getParent();
+    BasicBlock::iterator InsertPos(AI);
+
+    BasicBlock *AllocaBB = PreCallingBB->splitBasicBlock(InsertPos,
+                                                         "rse.alloca");
+
+    InsertPos = BasicBlock::iterator(AllocaEnd);
+    BasicBlock *CallingBB =
+        AllocaBB->splitBasicBlock(InsertPos, PreCallingBB->getName() + ".1");
+
+    PHINode *NewHeadPHI = PHINode::Create(ListTy->getPointerTo(), 2,
+                                          "rse.newhead",
+                                          CallingBB->getFirstNonPHI());
+    NewHeadPHI->addIncoming(NewHeadPtr, AllocaBB);
+
+    BasicBlock *PullBB = BasicBlock::Create(C, "rse.allocfromfree", &F,
+                                            CallingBB);
+
+    // Compute the address of the first pulled item.
+    Value *IdxNext[] = {
+        ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                         Marker->getItemIndex(0)),
+        ConstantInt::get(IntegerType::get(C, 32), 1),
+        ConstantInt::get(IntegerType::get(C, 32), 0),
+    };
+    Instruction *PulledItem0 = GetElementPtrInst::CreateInBounds(
+        ListTy, FreeList, makeArrayRef(IdxNext, 1), "rse.pulleditemgep.0",
+        PullBB);
+    PulledItem0->setDebugLoc(DebugLoc);
+    NewHeadPHI->addIncoming(PulledItem0, PullBB);
+
+    IdxNext[0] = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                                  Marker->getItemIndex(OtherCandidatesNumOf - 1));
+    Instruction *NewFreePtr = GetElementPtrInst::CreateInBounds(
+        ListTy, FreeList, makeArrayRef(IdxNext, 3), "rse.freelist.nextptr",
+        PullBB);
+    NewFreePtr->setDebugLoc(DebugLoc);
+
+    LoadInst *NewFree = new LoadInst(NewFreePtr, "rse.newfreelist", false,
+                                     DL.getABITypeAlignment(ListTy->getPointerTo()),
+                                     PullBB);
+    NewFree->setDebugLoc(DebugLoc);
+
+    BranchInst *BI = BranchInst::Create(CallingBB, PullBB);
+    BI->setDebugLoc(DebugLoc);
+
+    InsertPos = BasicBlock::iterator(CallingBB->getFirstNonPHI());
+    PHINode *AllocatedPHI = PHINode::Create(ListTy->getPointerTo(), 2,
+                                            "rse.allocated", &*InsertPos);
+    AllocatedPHI->setDebugLoc(DebugLoc);
+
+    // Replace all remaining uses of AI with the new PHI node.
+    for (auto i = AI->use_begin(), e = AI->use_end(); i != e; /* in loop */) {
+      Use &U = *(i++);
+      if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
+        if (I->getParent() == CallingBB)
+          U.set(AllocatedPHI);
+      }
+    }
+
+    AllocatedPHI->addIncoming(AI, AllocaBB);
+    AllocatedPHI->addIncoming(FreeList, PullBB);
+
+    PHINode *UpdatedFreeList = PHINode::Create(ListTy->getPointerTo(), 2,
+                                               "rse.freelist.next",
+                                               &*InsertPos);
+    UpdatedFreeList->setDebugLoc(DebugLoc);
+    UpdatedFreeList->addIncoming(FreeList, AllocaBB);
+    UpdatedFreeList->addIncoming(NewFree, PullBB);
+    FreeList->addIncoming(UpdatedFreeList,
+                          UpdatedFreeList->getParent());
+
+    InsertPos = BasicBlock::iterator(PreCallingBB->getTerminator());
+    ICmpInst *CmpI = new ICmpInst(&*InsertPos, ICmpInst::ICMP_EQ, FreeList,
+                                  ConstantPointerNull::get(ListTy->getPointerTo()),
+                                  "rse.cmp_freelist_empty");
+    CmpI->setDebugLoc(DebugLoc);
+
+    BI = BranchInst::Create(AllocaBB, PullBB, CmpI);
+    BI->setDebugLoc(DebugLoc);
+    ReplaceInstWithInst(PreCallingBB->getTerminator(), BI);
+
+    return NewHeadPHI;
+  }
+
+  /// Eliminate the other recursive calls by adding their data to the item
+  /// list.
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //   br %loopentry
+  //
+  // ->
+  //
+  // calling.bb:
+  //   ...
+  //   %movable.0 = add ...
+  //   %movable.k = add ...
+  //
+  //   ; allocate memory on stack
+  //   %itemptr = alloca %listty, 2
+  //
+  //   ; compute addresses of list items
+  //   %gep_0 = getelementptr %listty, 0
+  //   ...
+  //   %gep_k = getelementptr %listty, k
+  //
+  //   ; set up and store list item [0]; set mark to false
+  //   mark.0 =
+  //   %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0
+  //   store %gep.1, *nextptr.0
+  //   ...
+  //   mark.k =
+  //   %nextptr.k = getelementptr %listty, itemptr, k, 1, 0
+  //   store %listhead, %nextptr.k
+  //
+  //   ; set up and store operands entries
+  //   %operands.0 = getelementptr %listty, itemptr, 0, 0
+  //   %arg.0-0 = insertvalue mark_0, undef, %movable.0-0
+  //   store %arg.0-0, %operands.0
+  //   ...
+  //   %operands.k = getelementptr %listty, itemptr, k, 0
+  //   %arg.k-0 = insertvalue mark_1, undef, %movable.k-0
+  //   store %arg.k-0, %operands.k
+  //
+  //   br %loopentry
+  //
+  //   ; below a reference to the source instructions
+  //
+  // H1:
+  //
+  //   %static.0 = call @fn1, %movable.0
+  //   call @recursive ...
+  //
+  //   %static.k = call @fn1, %movable.k
+  //   call @recursive ...
+  //
+  //   br ret
+  void eliminateOtherCalls(CandidateContainer &OtherCandidates,
+                           BasicBlock *CallerBB, PHINode *ListHead,
+                           PHINode *FreeList) {
+    Instruction *InsertPos = CallerBB->getTerminator();
+    Type *OperandsTy = ListTy->getTypeAtIndex(0U);
+
+    // Add an entry to the list for all candidates except the first. This
+    // eliminates the calls to those candidates.
+    AllocaInst *AllocaPtr = new AllocaInst(
+        ListTy, DL.getAllocaAddrSpace(),
+        ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                         OtherCandidatesNumOf),
+        DL.getABITypeAlignment(ListTy), "rse.itemptr", InsertPos);
+    AllocaPtr->setDebugLoc(InsertPos->getDebugLoc());
+
+    SmallVector<Instruction *, 4> ItemGEPs;
+    ItemGEPs.reserve(OtherCandidatesNumOf);
+    unsigned idx = 0;
+    for (auto &Cand: OtherCandidates) {
+      Value *Idx = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                                    Marker->getItemIndex(idx));
+      Instruction *ItemGEP = GetElementPtrInst::CreateInBounds(
+          ListTy, AllocaPtr, makeArrayRef(&Idx, 1),
+          "rse.itemgep." + Twine(idx), InsertPos);
+      ItemGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc());
+      ItemGEPs.push_back(ItemGEP);
+
+      idx++;
+    }
+
+    // Fill the list link structure with the necessary values. With a freelist
+    // available, all values except the last nextptr are constant during this
+    // execution instance. The address values are all relative to the allocated
+    // address, and the last mark (if any) also does not need to change when
+    // pulling the element from the freelist. AllocaEnd marks the first
+    // instruction that must be executed when pulling from the freelist.
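+    //
+    // Illustrative example (an assumption: two remaining candidates and
+    // StackGrowsDown, i.e. the CompareMarker reverses the item order via
+    // getItemIndex): candidate 0 is written to itemptr[1] with next =
+    // &itemptr[0]; candidate 1 is written to itemptr[0] with next = the old
+    // list head, and the address computation of that last next field is what
+    // AllocaEnd points to.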
+    Instruction *AllocaEnd = nullptr;
+    idx = 0;
+    for (auto &Cand: OtherCandidates) {
+      const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc();
+
+      Value *Idx[3] = {
+          ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                           Marker->getItemIndex(idx)),
+          ConstantInt::get(IntegerType::get(C, 32), 1),
+          ConstantInt::get(IntegerType::get(C, 32), 0) };
+
+      Instruction *LinkPtrGEP = GetElementPtrInst::CreateInBounds(
+          ListTy, AllocaPtr, makeArrayRef(Idx, 2),
+          "rse.linkptr." + Twine(idx), InsertPos);
+      LinkPtrGEP->setDebugLoc(DebugLoc);
+      Marker->addSetupMarkerInst(LinkPtrGEP, idx, InsertPos, DebugLoc, DL);
+
+      Instruction *NextPtrGEP = GetElementPtrInst::CreateInBounds(
+          ListTy, AllocaPtr, makeArrayRef(Idx, 3),
+          "rse.nextptr." + Twine(idx), InsertPos);
+      NextPtrGEP->setDebugLoc(DebugLoc);
+
+      Value *NextVal;
+      if (idx < OtherCandidatesNumOf - 1) {
+        NextVal = ItemGEPs[idx+1];
+      }
+      else {
+        NextVal = ListHead;
+        AllocaEnd = NextPtrGEP;
+      }
+
+      StoreInst *SNPI = new StoreInst(NextVal, NextPtrGEP, false,
+                                      DL.getABITypeAlignment(NextVal->getType()),
+                                      InsertPos);
+      SNPI->setDebugLoc(DebugLoc);
+
+      idx++;
+    }
+
+    // Fill the operands part of the list items.
+    idx = 0;
+    for (auto &Cand: OtherCandidates) {
+      const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc();
+
+      unsigned ValueIdx = 0;
+      Instruction *OperandsVal = nullptr;
+      for (auto &IA: IncomingArguments) {
+        if (!IA.isStacked())
+          continue;
+
+        Value *ValueToStore = IA.getValue(&Cand);
+
+        OperandsVal = InsertValueInst::Create(
+            (OperandsVal ? cast<Value>(OperandsVal)
+                         : cast<Value>(UndefValue::get(OperandsTy))),
+            ValueToStore, makeArrayRef(&ValueIdx, 1),
+            "rse.arg." + Twine(idx) + ".Op" + Twine(ValueIdx), InsertPos);
+        OperandsVal->setDebugLoc(DebugLoc);
+        ValueIdx++;
+      }
+
+      // OperandsVal might be null if no operands have to be stored in the
+      // list.
+      if (OperandsVal) {
+        // Create the GEP that points to the operands struct.
+        Value *Idx[2] = {
+            ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                             Marker->getItemIndex(idx)),
+            ConstantInt::get(IntegerType::get(C, 32), 0) };
+        Instruction *OperandsGEP = GetElementPtrInst::CreateInBounds(
+            ListTy, AllocaPtr, makeArrayRef(Idx, 2),
+            "rse.operandsgep." + Twine(idx), InsertPos);
+        OperandsGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc());
+
+        StoreInst *SOVI = new StoreInst(OperandsVal, OperandsGEP, false,
+                                        DL.getABITypeAlignment(OperandsTy),
+                                        InsertPos);
+        SOVI->setDebugLoc(DebugLoc);
+      }
+
+      idx++;
+    }
+
+    Instruction *NewHead = replaceAllocaWithFreeList(AllocaPtr, AllocaEnd,
+                                                     ItemGEPs[0], FreeList);
+
+    // Redirect the list head by adjusting the listhead PHI node in the loop
+    // entry.
+    ListHead->addIncoming(NewHead, NewHead->getParent());
+  }
+
+  /// Create a new return block which pulls from the list. It loops until the
+  /// list is empty. It returns if and only if the list is empty.
+  ///
+  /// This function splits the terminator instructions from the basic block
+  /// currently containing the remaining static instructions.
+  //
+  // H1:
+  //
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  //
+  // ->
+  //
+  // pull:
+  //   %listempty = icmp %listhead, null
+  //   br %listempty, %return, %H1
+  //
+  // H1:
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br return
+  //
+  // return:
+  //   br ret
+  //
+  std::tuple<BasicBlock *, BasicBlock *, BasicBlock *>
+  createNewReturnBlock(BasicBlock *StaticInstBB, RecursiveCall *CC, PHINode *ListHead) {
+    CallInst *CI = CC->getCallInst();
+    const DebugLoc &DebugLoc = CI->getDebugLoc();
+
+    // If the block is a non-returning block, then its single successor is a
+    // return-only block and the branch is unconditional.
+    if (BranchInst *BI = dyn_cast<BranchInst>(StaticInstBB->getTerminator())) {
+      BasicBlock *RetOnlyBB = StaticInstBB->getSingleSuccessor();
+      assert(RetOnlyBB && "Recursion has more than one successor");
+      Instruction *NewTI = RetOnlyBB->getTerminator()->clone();
+      NewTI->insertBefore(BI);
+      NewTI->setDebugLoc(BI->getDebugLoc());
+      BI->eraseFromParent();
+    }
+
+    // Create the BB which pulls an item from the list.
+    BasicBlock *PullBB = BasicBlock::Create(C, "rse.pull", &F, StaticInstBB);
+
+    // Split off the terminator instruction. This becomes the returning
+    // basic block.
+    BasicBlock *NewReturnBB = StaticInstBB->splitBasicBlock(
+        StaticInstBB->getTerminator(), "rse.return");
+    NewReturnBB->getTerminator()->setDebugLoc(DebugLoc);
+
+    ICmpInst *CmpI = new ICmpInst(*PullBB,
+                                  ICmpInst::ICMP_EQ,
+                                  ListHead,
+                                  ConstantPointerNull::get(ListTy->getPointerTo()),
+                                  "rse.cmp_list_empty");
+    CmpI->setDebugLoc(DebugLoc);
+
+    BranchInst *BI = BranchInst::Create(NewReturnBB, StaticInstBB, CmpI, PullBB);
+    BI->setDebugLoc(DebugLoc);
+
+    return std::make_tuple(PullBB, StaticInstBB, NewReturnBB);
+  }
+
+  /// Insert the basic block which returns items to the freelist. The mark of
+  /// a pulled item signals whether it is the last item of a bulk allocation;
+  /// only then is the whole block prepended to the freelist.
+  //
+  // H1:
+  //   ; load data from list
+  //   %listitem = load %listhead
+  //
+  //   ; extract data from list (mark is omitted if possible)
+  //   %next = extractvalue %listitem, 1, 0
+  //   ...
+  //
+  // -> [instructions are inserted below %next]
+  //
+  // H1:
+  //   ; load data from list
+  //   %listitem = load %listhead
+  //
+  //   ; extract data from list (mark is omitted if possible)
+  //   %next = extractvalue %listitem, 1, 0
+  //   %mark = ...
+  //   br %mark, %addtofree, %H2
+  //
+  // addtofree:
+  //   %firstinblockptr = getelementptr %listty, %listhead, -1
+  //   %freenextptr = getelementptr %listty, %listhead, -1, 1, 0
+  //   store %freelist, %freenextptr
+  //   br %H2
+  //
+  // H2:
+  //   %newfree = phi [%freelist, %H1], [%firstinblock, %addtofree]
+  //
+  //   ...
+  //
+  void insertAddToFreeListBB(Value *ListItem, Instruction *NextPtr, PHINode *ListHead, PHINode *FreeList,
+                             const DebugLoc &DebugLoc) {
+    BasicBlock *H1 = NextPtr->getParent();
+    BasicBlock::iterator InsertPos(NextPtr);
+    ++InsertPos;
+    unsigned LinkItemIdx = 1;
+    Instruction *LinkItemVal = ExtractValueInst::Create(
+        ListItem, makeArrayRef(&LinkItemIdx, 1), "rse.linkval", &*InsertPos);
+    LinkItemVal->setDebugLoc(DebugLoc);
+    Value *Mark = Marker->addComputeMarkInst(LinkItemVal, ListHead, NextPtr, &*InsertPos, DebugLoc);
+
+    if (ConstantInt *MarkerConst = dyn_cast<ConstantInt>(Mark)) {
+      // Nothing to do: the marker is constant false.
+      if (MarkerConst->isZero()) {
+        // Adjust the freelist PHI.
+        FreeList->addIncoming(FreeList, InsertPos->getParent());
+        return;
+      }
+    }
+
+    // Compute the address of the bulk allocation.
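+    // As a sketch, assuming two items per bulk allocation with the marked
+    // (last) item at index 1: the list head then points one element past the
+    // first item of the block, so
+    //
+    //   %rse.firstinblockptr = getelementptr inbounds %listty, %listty* %listhead, i64 -1
+    //
+    // recovers the address originally produced by the alloca.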
+    signed IdxOfFirstItem = -(signed)Marker->getItemIndex(OtherCandidatesNumOf - 1);
+    Value *IdxBulk[] = {
+      ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), IdxOfFirstItem),
+    };
+    Instruction *NewFreeHead = GetElementPtrInst::CreateInBounds(ListTy,
+                                                                 ListHead,
+                                                                 makeArrayRef(IdxBulk, 1),
+                                                                 "rse.firstinblockptr",
+                                                                 &*InsertPos);
+    NewFreeHead->setDebugLoc(DebugLoc);
+
+    // Adjust the freelist PHI.
+    FreeList->addIncoming(NewFreeHead, InsertPos->getParent());
+
+    // If the freelist handling is conditional, put it into its own basic
+    // block and execute it conditionally.
+    if (isa<Instruction>(Mark)) {
+      // Split off a basic block and add a conditional branch instruction.
+      BasicBlock *H2 = H1->splitBasicBlock(InsertPos, "rse.H2");
+      BasicBlock *AddToFreeBB = H1->splitBasicBlock(NewFreeHead, "rse.addtofree");
+
+      // Insert a branch depending on the mark.
+      BranchInst *BI = BranchInst::Create(AddToFreeBB, H2, Mark);
+      BI->setDebugLoc(DebugLoc);
+      ReplaceInstWithInst(H1->getTerminator(), BI);
+
+      // Add a PHINode into H2.
+      InsertPos = BasicBlock::iterator(H2->getFirstNonPHI());
+      PHINode *FreeNextPHI = PHINode::Create(ListTy->getPointerTo(), 2,
+                                             "rse.newfree.H",
+                                             &*InsertPos);
+      FreeNextPHI->setDebugLoc(DebugLoc);
+      NewFreeHead->replaceAllUsesWith(FreeNextPHI);
+      FreeNextPHI->addIncoming(FreeList, H1);
+      FreeNextPHI->addIncoming(NewFreeHead, AddToFreeBB);
+    }
+
+    // Store the old freelist head into the last element of the item. This is
+    // done after potentially splitting off the conditional branch:
+    // `NewFreeHead->replaceAllUsesWith(FreeNextPHI)` must not rewrite this
+    // use, since we want the original value here.
+    InsertPos = std::next(BasicBlock::iterator(NewFreeHead));
+    // Compute the address of the last next pointer.
+    Value *IdxNext[] = {
+      ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(OtherCandidatesNumOf - 1)),
+      ConstantInt::get(IntegerType::get(C, 32), 1),
+      ConstantInt::get(IntegerType::get(C, 32), 0),
+    };
+    Instruction *FreeNext = GetElementPtrInst::CreateInBounds(ListTy,
+                                                              NewFreeHead,
+                                                              makeArrayRef(IdxNext, 3),
+                                                              "rse.freenextptr",
+                                                              &*InsertPos);
+    FreeNext->setDebugLoc(DebugLoc);
+
+    StoreInst *SI = new StoreInst(FreeList,
+                                  FreeNext,
+                                  false,
+                                  DL.getABITypeAlignment(ListTy->getPointerTo()),
+                                  &*InsertPos);
+    SI->setDebugLoc(DebugLoc);
+  }
+
+  /// Transform the path which does not return out of the old return block such
+  /// that it picks the information from the itemlist and loops back to the
+  /// loopentry after executing the relevant parts of the function H.
+  //
+  // pull:
+  //   %listempty = icmp %listhead, null
+  //   br %listempty, %return, %H1
+  //
+  // H1:
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br return
+  //
+  // ->
+  //
+  // pull:
+  //   %listempty = icmp %listhead, null
+  //   br %listempty, %return, %H1
+  //
+  // H1:
+  //   ; load data from list
+  //   %listitem = load %listhead
+  //
+  //   ; extract data from list (mark is omitted if possible)
+  //   %next = extractvalue %listitem, 1, 0
+  //   %mark = ...
+  //   %operand.0 = extractvalue %listitem, 0, 0
+  //
+  //   ; execute function H but replace the recursive call by a branch into the loop entry
+  //   %static1 = call @fn1, %operand.0
+  //
+  //   ; Note, %static1 is used to adjust the corresponding argument phi.
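+  //   ; Note, %next becomes the incoming value of the %listhead phi for the
+  //   ; next trip through %loopentry.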
+  //   br %loopentry
+  //
+  template <typename ArgumentPHIContainer>
+  void transformHExecution(
+      CandidateContainer &OtherCandidates,
+      BasicBlock *HExecuteBB,
+      PHINode *ListHead, PHINode *FreeList,
+      BasicBlock *LoopEntry,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+
+    Instruction *InsertPos = HExecuteBB->getFirstNonPHIOrDbg();
+
+    // Split off the old static instructions, which become unreachable; they
+    // are deleted later.
+    BasicBlock *StaticUnreachBB = HExecuteBB->splitBasicBlock(InsertPos, "rse.staticunreachable");
+    InsertPos = HExecuteBB->getTerminator();
+
+    RecursiveCall &CC = OtherCandidates.front();
+    const DebugLoc &DebugLoc = CC.getCallInst()->getDebugLoc();
+
+    // Load the list head content.
+    LoadInst *ListItem = new LoadInst(ListHead,
+                                      "rse.listitem",
+                                      false,
+                                      DL.getABITypeAlignment(ListTy),
+                                      InsertPos);
+    ListItem->setDebugLoc(DebugLoc);
+
+    // Extract the elements of the listitem to prepare the operands.
+    unsigned Idx[2] = { 1, 0 };
+
+    ExtractValueInst *NextPtr = ExtractValueInst::Create(
+        ListItem, makeArrayRef(Idx, 2), "rse.next", InsertPos);
+    NextPtr->setDebugLoc(DebugLoc);
+
+    unsigned IncomingCount = IncomingArguments.size();
+    SmallVector<Value *, 8> Incoming;
+    Incoming.reserve(IncomingCount);
+
+    Idx[0] = 0;
+    Idx[1] = 0;
+    for (IncomingArgument &IA : IncomingArguments) {
+      Value *V = IA.getValue();
+      if (IA.isConstant() || IA.isStatic()) {
+        Incoming.push_back(V);
+      } else {
+        Value *EVI = ExtractValueInst::Create(
+            ListItem, makeArrayRef(Idx, 2), "rse.operand." + Twine(Idx[1]), InsertPos);
+        Incoming.push_back(EVI);
+        Idx[1]++;
+      }
+    }
+
+    // Clone the instructions from function H back into the code sequence.
+    Function *FunctionH = CC.getFunctionH();
+    assert(FunctionH && "FunctionH not present.");
+
+    CallInst *CI = nullptr;
+    std::map<Value *, Instruction *> VVMap;
+    for (Instruction &I : FunctionH->getEntryBlock()) {
+      if (&I == FunctionH->getEntryBlock().getTerminator())
+        continue;
+
+      Instruction *NewI = I.clone();
+      VVMap[&I] = NewI;
+      NewI->insertBefore(InsertPos);
+      CI = dyn_cast<CallInst>(NewI);
+
+      for (Use &U : NewI->operands()) {
+        Value *V = U.get();
+
+        auto VVMI = VVMap.find(V);
+
+        // For an Argument, replace the operand with the corresponding item in
+        // Incoming.
+        if (Argument *A = dyn_cast<Argument>(V)) {
+          U.set(Incoming[A->getArgNo()]);
+        }
+        // For values within the value-value map, use VVMap to resolve the
+        // relation.
+        else if (VVMI != VVMap.end()) {
+          U.set(VVMI->second);
+        }
+        // Otherwise it must be a constant that can be left unchanged.
+        else {
+          assert(isa<Constant>(V) && "could not transform. Not a constant");
+        }
+      }
+    }
+
+    assert(CI && CI->getCalledFunction() == &F && "Last instruction in FunctionH was not a recursive CallInst");
+
+    // Redirect the terminator.
+    BranchInst *BI = dyn_cast<BranchInst>(HExecuteBB->getTerminator());
+    assert(BI && BI->isUnconditional() && "Invalid Terminator in stackrecurse.");
+    BI->setSuccessor(0, LoopEntry);
+
+    // Adjust the list PHI node in the loop entry.
+    ListHead->addIncoming(NextPtr, HExecuteBB);
+
+    // Adjust the argument PHI nodes in the loop entry.
+    adjustPHIsFromCallInst(HExecuteBB, CI, ArgumentPHIs);
+
+    // Finally erase the CallInst.
+    CI->eraseFromParent();
+
+    // Insert code for stack cleanup.
+    insertAddToFreeListBB(ListItem, NextPtr, ListHead, FreeList, DebugLoc);
+
+    // Remove the instructions which became unreachable.
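+    // They all live in StaticUnreachBB, so once the candidates' source
+    // instructions are erased, only the terminator may remain (asserted
+    // below).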
+    for (auto &Cand : OtherCandidates) {
+      Cand.eraseSourceInstructions();
+    }
+
+    assert(&StaticUnreachBB->front() == StaticUnreachBB->getTerminator() && "Not all instructions have been removed");
+
+    StaticUnreachBB->eraseFromParent();
+  }
+
+  /// Redirect all returning blocks into the new return block.
+  void redirectRemainingReturnBlocks(BasicBlock *PullBB, BasicBlock *NewReturnBB) {
+    for (auto &BB : F) {
+      if (&BB == NewReturnBB)
+        continue;
+
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        BranchInst *BI = BranchInst::Create(PullBB, RI);
+        BI->setDebugLoc(RI->getDebugLoc());
+        RI->eraseFromParent();
+      }
+    }
+  }
+
+  /// Call the transformation functions above to apply the transformation.
+  void eliminateRecursion() {
+    SmallVector<PHINode *, 8> ArgumentPHIs;
+    PHINode *ListHead;
+    PHINode *FreeList;
+    BasicBlock *LoopEntry;
+
+    std::tie(ListHead, FreeList, LoopEntry) = prepareEntry(ArgumentPHIs);
+
+    BasicBlock *CallerBB, *StaticInstBB;
+    std::tie(CallerBB, StaticInstBB) =
+        eliminateFirstCall(FirstCandidate.front(), LoopEntry, ArgumentPHIs);
+
+    eliminateOtherCalls(
+        OtherCandidates,
+        CallerBB,
+        ListHead,
+        FreeList);
+
+    BasicBlock *PullBB;
+    BasicBlock *HExecuteBB;
+    BasicBlock *NewReturnBB;
+    std::tie(PullBB, HExecuteBB, NewReturnBB) =
+        createNewReturnBlock(StaticInstBB, &OtherCandidates.front(), ListHead);
+
+    transformHExecution(
+        OtherCandidates,
+        HExecuteBB, ListHead, FreeList, LoopEntry, ArgumentPHIs);
+
+    redirectRemainingReturnBlocks(PullBB, NewReturnBB);
+
+    for (PHINode *PN : ArgumentPHIs) {
+      // If the PHI node is a dynamic constant, replace it with the value it
+      // always has.
+      if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+
+    if (Value *PNV = SimplifyInstruction(FreeList, F.getParent()->getDataLayout())) {
+      FreeList->replaceAllUsesWith(PNV);
+      FreeList->eraseFromParent();
+    }
+
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "RecursionEliminated",
+                                F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+             << "Stack modeled with width: " << ore::NV("Width", OtherCandidatesNumOf + 1);
+    });
+  }
+};
+
+/// Wrapper class for the legacy pass manager.
+struct RecursionStackElimination : public FunctionPass {
+  static char ID;
+  RecursionStackElimination() : FunctionPass(ID) {
+    initializeRecursionStackEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (DisableRecursionStackElimination)
+      return false;
+
+    if (skipFunction(F))
+      return false;
+
+    return RecursionStackEliminationImpl(
+        F,
+        getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()).run();
+  }
+};
+} // end anonymous namespace
+
+char RecursionStackElimination::ID = 0;
+INITIALIZE_PASS_BEGIN(RecursionStackElimination, DEBUG_TYPE, "Recursion Stack Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(RecursionStackElimination, DEBUG_TYPE, "Recursion Stack Elimination",
+                    false, false)
+
+/// Public interface to the RecursionStackElimination pass.
+FunctionPass *llvm::createRecursionStackEliminationPass() {
+  return new RecursionStackElimination();
+}
+
+/// Wrapper for the new pass manager.
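+///
+/// It mirrors the legacy wrapper: bail out while the transformation is
+/// disabled, otherwise run the shared implementation and report which
+/// analyses are preserved.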
+PreservedAnalyses RecursionStackEliminationPass::run(Function &F,
+                                                     FunctionAnalysisManager &AM) {
+  if (DisableRecursionStackElimination)
+    return PreservedAnalyses::all();
+
+  bool Changed = RecursionStackEliminationImpl(
+      F,
+      AM.getResult<AAManager>(F),
+      AM.getResult<OptimizationRemarkEmitterAnalysis>(F)).run();
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<GlobalsAA>();
+
+  return PA;
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -93,6 +93,7 @@
   initializeSimpleLoopUnswitchLegacyPassPass(Registry);
   initializeSinkingLegacyPassPass(Registry);
   initializeTailCallElimPass(Registry);
+  initializeRecursionStackEliminationPass(Registry);
   initializeSeparateConstOffsetFromGEPPass(Registry);
   initializeSpeculativeExecutionLegacyPassPass(Registry);
   initializeStraightLineStrengthReducePass(Registry);
@@ -236,6 +237,10 @@
   unwrap(PM)->add(createTailCallEliminationPass());
 }
 
+void LLVMAddRecursionStackEliminationinationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createRecursionStackEliminationPass());
+}
+
 void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createConstantPropagationPass());
 }