Index: include/llvm-c/Transforms/Scalar.h =================================================================== --- include/llvm-c/Transforms/Scalar.h +++ include/llvm-c/Transforms/Scalar.h @@ -123,6 +123,9 @@ /** See llvm::createTailCallEliminationPass function. */ void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM); +/** See llvm::createMultiTailCallEliminationPass function. */ +void LLVMAddMultiTailCallEliminationPass(LLVMPassManagerRef PM); + /** See llvm::createConstantPropagationPass function. */ void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -385,6 +385,7 @@ void initializeStripSymbolsPass(PassRegistry&); void initializeStructurizeCFGPass(PassRegistry&); void initializeTailCallElimPass(PassRegistry&); +void initializeMultiTailCallEliminationPass(PassRegistry&); void initializeTailDuplicatePass(PassRegistry&); void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); void initializeTargetPassConfigPass(PassRegistry&); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -280,6 +280,12 @@ //===----------------------------------------------------------------------===// // +// MultiTailCallElimination - This pass eliminates multiple tail calls. +// +FunctionPass *createMultiTailCallEliminationPass(); + +//===----------------------------------------------------------------------===// +// // EarlyCSE - This pass performs a simple and fast CSE pass over the dominator // tree. // Index: include/llvm/Transforms/Scalar/MultiTailCallElimination.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Scalar/MultiTailCallElimination.h @@ -0,0 +1,30 @@ +//===---- MultiTailCallElimination.h ----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts multiple tail recursive calls into a loop, by modeling the +// calls as a single linked worklist explicitly on the stack. +// +// See MultiTailCallElimination.cpp for more details. 
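+//
+// Usage sketch (illustrative only, not part of this patch): with the new pass
+// manager the transformation is added as a function pass, e.g.
+//
+//   FunctionPassManager FPM;
+//   FPM.addPass(MultiTailCallEliminationPass());
+//
+// For the legacy pass manager the pass is created via
+// llvm::createMultiTailCallEliminationPass().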
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H +#define LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct MultiTailCallEliminationPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} + +#endif // LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -133,6 +133,7 @@ #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" +#include "llvm/Transforms/Scalar/MultiTailCallElimination.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/NewGVN.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" @@ -359,6 +360,14 @@ assert(Level != O0 && "Must request optimizations!"); FunctionPassManager FPM(DebugLogging); + if (Level == O3) { + FPM.addPass(SROA()); // Try to get rid of allocas + FPM.addPass(MultiTailCallEliminationPass()); + FPM.addPass(VerifierPass()); + FPM.addPass(ADCEPass()); // Delete dead instructions + FPM.addPass(SimplifyCFGPass()); // Merge & remove BBs + } + // Form SSA out of local memory accesses after breaking apart aggregates into // scalars. FPM.addPass(SROA()); Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -543,6 +543,15 @@ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args addExtensionsToPM(EP_CGSCCOptimizerLate, MPM); + + if ((SizeLevel == 0) && (OptLevel > 2)) { + MPM.add(createSROAPass()); + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createMultiTailCallEliminationPass()); + MPM.add(createVerifierPass()); + MPM.add(createAggressiveDCEPass()); // Delete dead instructions + } + addFunctionSimplificationPasses(MPM); // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -48,6 +48,7 @@ MemCpyOptimizer.cpp MergeICmps.cpp MergedLoadStoreMotion.cpp + MultiTailCallElimination.cpp NaryReassociate.cpp NewGVN.cpp PartiallyInlineLibCalls.cpp Index: lib/Transforms/Scalar/MultiTailCallElimination.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/MultiTailCallElimination.cpp @@ -0,0 +1,2472 @@ +//===- MultiTailCallElimination.cpp - Eliminate multiple tail calls -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts multiple tail recursive calls into a loop, by modeling the +// calls as a single linked worklist explicitly on the stack. +// +// void f(a, b, const c) { +// ... +// if (...) +// return +// +// a1, b1, b2, b3, x2, x3 = ... 
+// +// f(a1, b1, c) // 1st recursion +// a2 = h(x2) +// f(a2, b2, c) // 2nd recursion +// a3 = h(x3) +// f(a3, b3, c) // 3rd recursion +// } +// +// -> +// +// void f(a, b, const c) { +// struct { x, b, next } *worklist = null +// +// loop: +// ... +// if (...) +// goto check_return +// +// a1, b1, b2, b3, x2, x3 = ... +// +// /* Assign arguments for the first call */ +// +// a = a1 +// b = b1 +// +// /* Put arguments of the remaining calls into the work list */ +// +// queue = alloca(2 * sizeof(*worklist)) +// queue[0] = {x2, b2, &queue[1]} +// queue[1] = {x3, b3, worklist} +// worklist = queue +// +// goto loop +// +// check_return: +// if (!worklist) +// return +// a = h(worklist->x) +// b = worklist->b +// worklist = worklist->next +// +// goto loop +// } +// +// Such patterns occur for example when an application traverses k-ary full trees. +// +// The benefits of this transformation is, that neither frame nor link address +// have to be stored on the stack. Also pass through arguments, like 'const c' +// in example above, are less likely required to saved onto the stack, and if so +// less often (only once in the entry block). +// +// The downsides are: +// +// a) An additional store for the worklist is required. +// +// b) The worst case required stack memory after this transformation depends on +// the number of recursive calls, instead of the recursion depth. +// +// c) Additional conditional branches +// +// ad a) This point is compensated by avoiding storing the link address in each +// call. +// +// ad b) This pass additionally can add (and does it by default) code to manage +// unused blocks of allocas in a freelist. Because allocations are done in +// blocks the freelist management, can also be done in blocks. This requires +// the last item in each block to be marked, detectable by the code within +// return. +// +// Free list management introduces the following code: +// +// 1) Upon entry a second variable is added just like the worklist. +// +// struct { x, b, next } *freelist = null +// +// 2) The alloca instruction is surrounded by a conditional +// +// if (freelist != nullptr) { +// queue = freelist +// freelist = freelist[].next; +// } +// else { +// queue = alloca(2 * sizeof(*worklist)) +// } +// +// 3) The return path in check return is modified dependent on the marker +// algorithm (see below) +// +// Currently the following mark algorithms are implemented: +// +// A further variable is stored on the stack containing the current +// freelist. On the other hand by not executing a recursion the frame pointer +// loads and stores are omitted. +// +// * TrueMarker: This algorithm marks each element and is applicable to binary +// recursions, such as quick sort. +// +// The following return path is generated: +// +// check_return: +// if (!worklist) +// return +// +// struct { x, b, next } nextitem = *worklist +// worklist[] = freelist +// freelist = worklist +// +// a = h(nextitem.x) +// b = nextitem.b +// worklist = nextitem.next +// +// goto loop +// +// * TaggedMarker: (not yet implemented) Similar to the FieldMarker this +// marker marks the last item by a bit. But instead of using a separate bit +// it uses Bit 0 of the worklist field. If the alignment of the worklist is +// a power of 2, and if it is >= 2, this marker can also cover the general +// case. It requires some additional bit masking but no additional memory +// operations. 
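+//
+// For illustration only (a C-like sketch, not code added by this patch), the
+// bit-0 tagging could be implemented roughly as:
+//
+//   uintptr_t tag(item *p)                   { return (uintptr_t)p | 1; }
+//   item     *untag(uintptr_t v)             { return (item *)(v & ~(uintptr_t)1); }
+//   bool      is_last_in_block(uintptr_t v)  { return (v & 1) == 0; }
+//
+// A set bit 0 means a further item follows within the same allocation block;
+// a cleared bit 0 marks the last item, whose block can be put on the freelist.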
+// +// The following return path is generated: +// +// check_return: +// struct { x, b, next } nextitem; +// +// if (bit_0_is_set(worklist)) { +// nextitem = *mask_bit_0(worklist) +// goto pre_loop +// } +// +// if (!worklist) +// return +// +// nextitem = *worklist +// worklist[] = freelist +// freelist = worklist +// +// pre_loop: +// a = h(nextitem.x) +// b = nextitem.b +// worklist = nextitem.next +// goto loop +// +// * FalseMarker: This algorithm marks no element, and is applicable when +// freelist management is disabled. Using this marker implicitly unmarks all +// items. This leads to the same code as described in the introduction. +// +// Additionally the following Marker algorithms have been tested but have been +// dropped. +// +// * FieldMarker: A separate bit is allocated in the worklist. This marker +// requires additional instructions but can be used in the general case. +// This marker is inferior to the tagged marker. +// +// * CompareMarker: Assuming allocas return memory addresses in a strictly +// monotonic order. The freelist can be modeled to return the same order +// when pulling elements from it. Comparing each worklist with worklist.next +// can then reveal the information if the element is marked. The performance +// is similar to the tagged marker but it requires an order for the returned +// pointers of alloca. LLVM does not guarantee this order in general. +// +// ad c) The pass adds 1 conditional branch into the return path and 1 +// additional branches for freelist management (see (b) above). Depending on +// the target, machine branch prediction can elevate this. +// +// List organization +// ----------------- +// +// Suppose a recursion the calls itself 4 times. Fig. 1 depicts a possible list +// state when executing the Step 3.3. Column 1 is a representation of the +// parameters. Column 2 denotes the information whether an item is marked and +// the third column shows the link information of the next pointer. +// +// Worklist always points the the first item to execute after the current +// item. One see that worklist already points the Step 3.4 while executing Step +// 3.3. +// +// Fig. 1: Work-List and free-List while executing Step 3.3. Note that work-list +// already points to Step 3.4 even when currently executing Step 3.3. +// +// +----------+---+-----+ +// | Step 2 | | * | +// +----------+---+ | --+ +----------+---+-----+ +// | Step 3 | | v * | | Step 3.2 | | * | +// +----------+---+-- | | +----------+---+ | --+ +// | Step 4 | M | * v | <-\ | Step 3.3 | | v * | [currently executing] +// +----------+---+-|---+ | +----------+---+-- | + +// v | | Step 3.4 | M | * v | <- [worklist] +// nullptr | +----------+---+ | --+ +// | | +// \--------------------/ +// +// +----------+---+-----+ +// | | | * | <- [freelist] +// +----------+---+ | --+ +// | | | v * | +// +----------+---+-- | + +// | | M | * v | +// +----------+---+ | --+ +// v +// nullptr +// +// The state while executing Step 3.4 might look like depicted in Fig. 2. The +// transformation of the state is done within the return path. +// +// Fig. 2: Work-list and free-list while executing Step 3.4. 
+//
+// +----------+---+-----+
+// | Step 2 | | * |
+// +----------+---+ | --+ +----------+---+-----+
+// | Step 3 | | v * | | Step 3.2 | | * | <- [free-list]
+// +----------+---+-- | | +----------+---+ | --+
+// | Step 4 | M | * v | <-\ | Step 3.3 | | v * |
+// +----------+---+-|---+ | +----------+---+-- | +
+// v | | Step 3.4 | M | * v | [currently executing]
+// nullptr | +----------+---+ | --+
+// | |
+// \--- [work-list] \-\
+// |
+// +----------+---+-- | +
+// | | | * v |
+// +----------+---+ | --+
+// | | | v * |
+// +----------+---+-- | +
+// | | M | * v |
+// +----------+---+ | --+
+// v
+// nullptr
+//
+// Algorithm outline:
+// ------------------
+//
+// Analysis Phase:
+//
+// 1) Analyze the function and gather the returning basic blocks (BB) (and BBs
+//    branching to a return-only BB) with recursive calls (RC).
+//
+// 2) If more than one such BB or no RC is found, abandon the transformation.
+//
+// 3) If the remaining BB has only one RC, abandon the transformation. Otherwise
+//    let N be the number of RCs in this BB.
+//
+// 4) Analyze the instructions in this BB from the first RC until the
+//    terminator instruction and classify each instruction as movable or static.
+//
+//    A movable instruction is an instruction that can be safely moved before
+//    the first RC. All other instructions are classified static.
+//
+// 5) Assign each static instruction to the following RC instruction. If static
+//    instructions are left after the last RC, abandon the transformation.
+//
+// 6) Build the function H with all its arguments for each RC. By including the
+//    call itself in function H and enforcing H to return void, it is ensured
+//    that there are no escaping values used after the recursion.
+//
+// 6.1) Note: By the way step 4 is executed it is guaranteed that function H for
+//    the first RC consists of a single instruction: the call itself. The first
+//    call candidate is handled specially (the same way as in Tail Recursion
+//    Elimination (TRE)).
+//
+// 5,6) Note: The information gathered on each RC is collected in the structure
+//    RecursiveCall.
+//
+// 7) Compare the second RC's function H with all later ones. The behavior must
+//    match, otherwise abandon the transformation.
+//
+// 7.1) Note: As the first RC's function H is basically a TRE, it can be ignored
+//    in this step.
+//
+// 8) Simplify the argument list by removing constants and pass-through arguments.
+//
+// 9) Decide whether it is profitable to use the transformation.
+//
+// Transformation Phase:
+//
+// 1) Adjust the entry block and split off the loop entry.
+//
+// 2) Eliminate the first RC (similar to TRE).
+//
+// 3) Eliminate the remaining RCs by allocating a new block of N-1 struct items
+//    (or picking one from the freelist) and filling it. This block is put at
+//    the front of the list. Pulling items from the list is added in (4). The
+//    execution of function H is ensured in (5).
+//
+// 4) Create a new return block which pulls items from the worklist. If an
+//    end-of-block marker is reached, the block is put into the freelist.
+//
+// 4.1) Add the instructions from function H into the return block and create
+//    the loop.
+//
+// 5) Redirect each returning block into the new return block created in (4).
+//
+// Open issues and known TODOs - it would be great if reviewers could comment on
+// these as well:
+//
+// 1) Pipeline integration: Currently the pass is put before TRE. This includes
+//    some supporting passes for cleanup and preparation. This cannot be left
+//    as is. The preferred way could be by adjusting
+//    "AArch64TargetMachine::adjustPassManager" and only using it with O3.
+//    AArch64 is selected because this is the architecture for which the author
+//    has suitable benchmarking and test setups available. This was tried once
+//    (in a similar way as in AMDGPUTargetMachine::adjustPassManager), however
+//    the result was that many LLVM projects did not compile anymore because of
+//    linker problems (the Passes library was missing). Do you have any advice
+//    here?
+//
+// 2) The way to test whether it is profitable to use the pass needs adjustment.
+//    I think that functions that spill more registers have an increased chance
+//    to profit, while functions that spill less have a lower chance.
+//
+// 4) Is it safe to temporarily create a function and return no-changes after
+//    deleting it? If not, is there a better way than using the
+//    FunctionComparator?
+//
+// 5) GlobalsAA needs to be preserved. Not sure about this in this context;
+//    loads and stores are added here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MultiTailCallElimination.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+
+#include
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "multi-tce"
+
+namespace {
+
+/// Option to disable the pass.
+static cl::opt<bool>
+DisableMultiTailCallElimination("disable-" DEBUG_TYPE, cl::init(false),
+    cl::Hidden, cl::desc("Disable multi tail call elimination"));
+
+/// Option to disable freelist handling.
+static cl::opt<bool>
+DisableFreeList("disable-" DEBUG_TYPE "-freelist", cl::init(false),
+    cl::Hidden, cl::desc("Disable freelists in multi tail call elimination"));
+
+/// Local implementation class.
+struct MultiTailCallEliminationImpl {
+  MultiTailCallEliminationImpl(
+      Function &F, AliasAnalysis &AA, TargetTransformInfo &TTI,
+      OptimizationRemarkEmitter &ORE) :
+    DL(F.getParent()->getDataLayout()),
+    C(F.getParent()->getContext()),
+    TTI(TTI),
+    ORE(ORE),
+    F(F),
+    AA(AA) {}
+
+  bool run() {
+    // Check if feasible and profitable,
+    if (analyzeFunction()) {
+      // then eliminate the tail calls.
+      eliminateTailCalls();
+      return true;
+    }
+
+    return false;
+  }
+
+private:
+  /// Helper class to collect information on a single recursive call site. This
+  /// class also stores the required properties of the function H (see the
+  /// introduction's description of this pass).
+  struct RecursiveCall {
+  private:
+    /// Hide copy constructor.
+    RecursiveCall(const RecursiveCall &O) : Index(O.Index), F(O.F), H(O.H), CI(O.CI), Is() {
+      llvm_unreachable("RecursiveCall cannot be copied");
+    }
+
+  public:
+    RecursiveCall(CallInst *CI, size_t Index) : Index(Index), F(*CI->getParent()->getParent()), H(nullptr), CI(CI), Is() {}
+
+    ~RecursiveCall() {
+      // If function H was already created, delete it. It is only used temporarily.
+ if (H) + H->eraseFromParent(); + } + + /// Add an instruction to the list representing the function H. + void addInstruction(Instruction *I) { + Is.push_back(I); + } + + /// Returns the recursive call instruction. + CallInst *getCallInst() const { + return CI; + } + + /// Test if the value V is part of function H. Because H is not created these + /// instructions are denoted "local" + bool isLocalValue(Value *V, unsigned *Idx = nullptr) { + auto LocalInstructionIt = std::find(Is.begin(), Is.end(), V); + + if (LocalInstructionIt == Is.end()) + return false; + + if (Idx) + *Idx = LocalInstructionIt - Is.begin(); + + return true; + } + + /// Get the list operands of that are required for calling the function H + const std::vector &getIncomingOperands() const { + return IncomingOperands; + } + + /// If possible create the function H (including the final recursive call + /// instruction). If it succeeds buildFunctionH returns true and assigns it to + /// the member variable H. The created function is purely temporary. Its + /// lifetime is bound to this object. The created function always returns void. + bool buildFunctionH() { + if (H) + H->eraseFromParent(); + H = nullptr; + + // Get the arguments list from H + SmallVector HArgs; + for (auto &U: IncomingOperands) + HArgs.push_back(U->get()->getType()); + + // Get the function type and create it + FunctionType *HTy = FunctionType::get(Type::getVoidTy(F.getContext()), makeArrayRef(HArgs), F.isVarArg()); + H = Function::Create(HTy, GlobalValue::PrivateLinkage, F.getAddressSpace(), F.getName() + ".H" + Twine(Index), F.getParent()); + BasicBlock *NewBB = BasicBlock::Create(H->getContext(), "entry", H); + + // Value-Value map to match instruction operands between the source and clone. + std::map VVMap; + + // Define a lambda used to clone an instruction. If leave constants is true + // those are taken from the argument list if possible. + auto cloneAndAppendInst = [this, &VVMap, NewBB] (Instruction *I, bool leaveConstants) { + // We do not allow any uses outside the newly created function. Otherwise + // the function would return some non void value. + for (Use &U : I->uses()) { + if (!isLocalValue(U.getUser()) && U.getUser() != CI) { + H->eraseFromParent(); + H = nullptr; + return false; + } + } + + // Clone the instruction, assign it to the newly created function and + // adjust all operands. + Instruction *NewI = I->clone(); + VVMap[I] = NewI; + NewBB->getInstList().push_back(NewI); + for (Use &U: NewI->operands()) { + // Keep constant operands as is. + if (leaveConstants && isa(U.get())) + continue; + + // Replace other operands by their counter part in the new function + if (isLocalValue(U.get())) { + auto VVMI = VVMap.find(U.get()); + if (VVMI == VVMap.end()) { + H->eraseFromParent(); + H = nullptr; + return false; + } + + U.set(VVMI->second); + } + else { + auto IOI = std::find_if(IncomingOperands.begin(), IncomingOperands.end(), [&U] (const Use* S) -> bool { + return S->get() == U.get(); + }); + + // If the operand is not found, ignore it if and only if it is a constant. 
+ if (IOI == IncomingOperands.end() && isa(U.get())) + continue; + assert(IOI != IncomingOperands.end() && "Operand not in argument list"); + size_t ArgIdx = IOI - IncomingOperands.begin(); + U.set(H->arg_begin() + ArgIdx); + } + } + + return true; + }; + // EOL - Lambda cloneAndAppendInst + + // Clone all instructions into the new function, + for (auto &I: Is) { + if (!cloneAndAppendInst(I, true)) + return false; + } + // but allow constant arguments for the recursive call inst. + if (!cloneAndAppendInst(CI, false)) + return false; + + // Finally, append the terminator instruction. + ReturnInst::Create(H->getContext(), NewBB); + + return true; + } + + /// Return function H and create it if necessary. + Function *getFunctionH() { + if (!H) { + buildFunctionH(); + } + return H; + } + + /// Remove duplicates within the operands while preserving the order of each + /// first occurrence. The operands a given as use. For duplicates and + /// arbitrary use is selected to be kept. + void cleanupIncomingOperands() { + std::map map; + std::vector tmp(IncomingOperands.begin(), IncomingOperands.end()); + IncomingOperands.clear(); + for (auto &O: tmp) { + bool inserted; + std::tie(std::ignore, inserted) = map.insert(std::make_pair(O->get(), O)); + if (inserted) + IncomingOperands.push_back(O); + } + } + + /// Collect all required operand for instruction I. If allowConstants is set, + /// Constants are also put on the operand list. + void analyzeOperands(User::op_range Operands, Instruction *I, bool allowConstants) { + for (auto &U: Operands) { + if (!allowConstants && isa(U.get())) + continue; + + // If the operand is a non-local value add it to the list. This covers all + // movable instructions, as well as static instructions of other previous + // candidates. For movable instructions this is feasible because they + // instructions will remain (moved). On the other hand static instruction + // are removed. Referring to those is invalid. An example is depicted in + // the sequence below. + // + // ; Candidate 0 + // call @recursive(%donotcare) + // ; ... + // ; + // ; Candidate n + // %a = load %something + // %b = load %somethingelse + // call @recursive(%a) + // ; ... + // ; + // ; Candidate m > n + // call @recursive(%b) + // + // This case is not detected within this call. However, because the + // function H of candidate n would have to return a non-void value, it is + // covered during buildFunctionH. + if (!isLocalValue(U.get())) { + IncomingOperands.push_back(&U); + } + } + } + + /// Analyze if this candidate can be feasible for this transformation. + bool analyze() { + IncomingOperands.clear(); + for (auto &I: Is) { + // In general it is not safe to replace a constant by an arbitrary value + // in the LLVM IR. allowConstants is there for set to false to avoid + // putting them onto the argument list. + analyzeOperands(I->operands(), I, false); + } + // As we know that all candidates are recursive only arg_operands are + // analyzed for the recursive call instruction. Also for CallInst + // arg_operands it is feasible and potentially profitable to replace + // constants with values. E.g.: + // + // call @recursive(%this, 0); + // call @recursive(%this, 1); + // ... + // call @recursive(%this, n); + // ret + analyzeOperands(CI->arg_operands(), CI, true); + + // Remove duplicates from operand list. + cleanupIncomingOperands(); + + // Try to encapsulate call and all static instruction into an new separate function. 
+ if (!buildFunctionH()) + return false; + + return true; + } + + /// Test if O's function H behaves equally compared to the own function H. + bool compareFunctionH(RecursiveCall *O) const { + GlobalNumberState GN; + return FunctionComparator(H, O->H, &GN).compare() == 0; + } + + /// Erase the CallInst from the source function. + void eraseSourceCallInst() { + CI->eraseFromParent(); + CI = nullptr; + } + + /// Erase all static instructions from the source function. + void eraseSourceInstructions() { + eraseSourceCallInst(); + for (auto i = Is.rbegin(), e = Is.rend(); i != e; /* inloop */) { + (*(i++))->eraseFromParent(); + } + Is.clear(); + } + + size_t getIndex() const { + return Index; + } + + private: + /// Index, solely used for naming + size_t Index; + + /// Recursive function the candidate is called within + Function &F; + + /// Temporary function H (including the recursive call). This function is used + /// during analysis and transformation. It is never called and destroyed when + /// this object is deleted. + Function *H; + + /// Recursive call instruction. + CallInst *CI; + + /// Instructions that precede the current call. + SmallVector Is; + std::vector IncomingOperands; + }; + + /// Helper class to collect and classify used arguments and analyze arguments + /// used by function H. + struct IncomingArgument { + /// Stacked arguments are arguments that need to be modeled on the stack + /// + /// Static arguments are arguments which are passed through to all + /// recursive calls without changing position nor value. + /// + /// e.g.: + /// fn(a, b, c) { + /// fn(a, b, c); + /// fn(a + 1, a, c); + /// } + /// => StaticArguments = {c} + /// + /// Constant arguments are Constants that are the equal for each call except the first. + /// + /// UnknownKind are arguments which are not yet classified. + enum Kind { + StackedKind, + StaticKind, + ConstantKind, + UnknownKind, + }; + + IncomingArgument() : Uses(), K(UnknownKind) {} + + /// Add an argument use and update the current classification. + void addUse(const RecursiveCall *CC, Use *U, bool CheckOnly = false) { + Value *V = U->get(); + switch (K) { + case UnknownKind: + K = StackedKind; + if (isa(V)) { + K = ConstantKind; + } + else if (Argument *A = dyn_cast(V)) { + if (A == CC->getCallInst()->getArgOperand(A->getArgNo())) { + K = StaticKind; + } + } + Ty = V->getType(); + break; + + case StackedKind: + // Keep Stacked as Stacked. + break; + + case ConstantKind: + // Change kind to stacked if the new argument is no constant or if the + // constant value differs. + if (Constant *C = dyn_cast(V)) { + if (C != getValue()) + K = StackedKind; + } + else { + K = StackedKind; + } + break; + + case StaticKind: + // Change kind to stacked if the new argument is no argument or if the + // the position in the corresponding CallInst is unsuitable. + if (Argument *A = dyn_cast(V)) { + if (A != CC->getCallInst()->getArgOperand(A->getArgNo())) { + K = StackedKind; + } + } + else { + K = StackedKind; + } + break; + } + assert(Ty == V->getType() && "Argument type mismatch"); + if (!CheckOnly) + Uses.insert(std::make_pair(CC, U)); + } + + /// Get the used value of a specific candidate, or the used value of any + /// candidate. + Value *getValue(const RecursiveCall *CC = nullptr) const { + if (CC) + return Uses.find(CC)->second->get(); + else + return Uses.begin()->second->get(); + } + + /// Get the type of this argument. + Type *getType() const { + return Ty; + } + + /// Returns true it this argument is of stacked kind. 
+ bool isStacked() const { + assert(K != UnknownKind && "Argument kind is unknown"); + return K == StackedKind; + } + + /// Returns true it this argument is of static kind. + bool isStatic() const { + assert(K != UnknownKind && "Argument kind is unknown"); + return K == StaticKind; + } + + /// Returns true it this argument is of constant kind. + bool isConstant() const { + assert(K != UnknownKind && "Argument kind is unknown"); + return K == ConstantKind; + } + + private: + /// Candidate-Use map of the argument. + std::map Uses; + + /// Classification. + Kind K; + + /// Argument type. + Type *Ty; + }; + + struct TrueMarker { + using ItemListContainer = SmallVectorImpl; + + TrueMarker(LLVMContext &C, unsigned OtherCandidatesNumOf) : C(C), OtherCandidatesNumOf(OtherCandidatesNumOf) {} + virtual ~TrueMarker() {} + + virtual int computeAllocaCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + // We factor in the stores to create the inner links + return Factor * (OtherCandidatesNumOf - 1) * TTI.getMemoryOpCost( + Instruction::Store, + // Pointer to next item + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()); + } + + virtual int computeFreelistManagementCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + // Store the next pointer + return Factor * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()); + } + + virtual int computeAllocFromFreeCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + // Load the next pointer + return Factor * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()); + } + + virtual int computeEliminationCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + return + // One Conditional Branch + Factor * TTI.getCFInstrCost(Instruction::Br) + + // assume 50% alloca (round up) + (computeAllocaCost(ListTy, Factor, TTI, DL) / 2) + + // assume 50% allocate from free list (round down) + (computeAllocFromFreeCost(ListTy, Factor, TTI, DL) / 2) + + // Store pointer to next item, once + Factor * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()) + + // Store parameters for each call itself + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(0)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(0)), + DL.getAllocaAddrSpace()); + } + + virtual int computeReturnPathCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + return + // One Conditional Branch for check return per iteration + Factor * OtherCandidatesNumOf * TTI.getCFInstrCost(Instruction::Br) + + // Free List Management per Iteration + OtherCandidatesNumOf * computeFreelistManagementCost(ListTy, Factor, TTI, DL) + + // Cost for loading the next element per iteration + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getPointerTo(), + DL.getABITypeAlignment(ListTy), + DL.getAllocaAddrSpace()); + } + + virtual int computeTransformCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + return + // One Conditional 
Branch for check return + Factor * TTI.getCFInstrCost(Instruction::Br) + + computeEliminationCost(ListTy, Factor, TTI, DL) + + computeReturnPathCost(ListTy, Factor, TTI, DL); + } + + // Create the return path for the TrueMarker. + // + // Generated Code: + // + // mtce.return: + // if (worklist == nullptr) + // goto mtce.retonly; + // + // mtce.add.freelist: + // nextitem = *worklist; + // ; Freelist management code is inserted here + // goto loopentry; + // + // mtce.retonly: + // return; + // + virtual std::tuple insertReturnPath( + Instruction *ListItem, Instruction *FreeList, BasicBlock *MtceReturnBB, + PHINode *WorkListItemPHI, PHINode *FreeListRecursivePHI, + const DebugLoc &DebugLoc, const DataLayout &DL) { + + Instruction *InsertPos; + + // Start by creating a basic block that adds an item to the free list + BasicBlock *AddFreeListBB = MtceReturnBB->splitBasicBlock(MtceReturnBB->getFirstNonPHI(), + "mtce.add.freelist"); + + // Check next item for null and return if necessary + InsertPos = MtceReturnBB->getTerminator(); + ICmpInst *CmpI = new ICmpInst(InsertPos, + ICmpInst::ICMP_EQ, + ListItem, + ConstantPointerNull::get(cast(ListItem->getType())), + "mtce.worklist.is.null.tm"); + CmpI->setDebugLoc(DebugLoc); + + // Create the final return only BB and branch to if indicated by the + // comparison. + BasicBlock *ReturningBB = AddFreeListBB->splitBasicBlock(AddFreeListBB->getTerminator(), + "mtce.retonly"); + BranchInst *CRBI = BranchInst::Create(ReturningBB, AddFreeListBB, CmpI); + CRBI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(MtceReturnBB->getTerminator(), CRBI); + + // Before managing the freelist the item has to be loaded in order not to + // change the next pointer of the worklist improperly. The according + // instructions is added here and the branch is updated (including the PHI node) + LoadInst *MLI = new LoadInst(ListItem, "mtce.mload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + AddFreeListBB->getTerminator()); + MLI->setDebugLoc(DebugLoc); + + BranchInst *ABI = cast(AddFreeListBB->getTerminator()); + BasicBlock *LoopEntry = WorkListItemPHI->getParent(); + ABI->setSuccessor(0, LoopEntry); + + WorkListItemPHI->addIncoming(MLI, AddFreeListBB); + + return std::make_tuple(ABI, ReturningBB); + } + + virtual bool mayPullFromFreeList() { + return true; + } + + virtual bool checkListType(Type *ListTy, const DataLayout &DL) { + return true; + } + + virtual void adjustStructFields(ItemListContainer &ItemList) {}; + + virtual Instruction *addComputeNextPtr(Value *ItemArray, unsigned ItemIdx, Instruction *InsertPos, + const DebugLoc &DebugLoc, const DataLayout &DL) { + Value *Idx = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), getItemIndex(ItemIdx)); + Instruction *NextPtr = GetElementPtrInst::CreateInBounds(cast(ItemArray->getType())->getElementType(), + ItemArray, + makeArrayRef(&Idx, 1), + "mtce.nextptr.for." 
+ Twine(ItemIdx), + InsertPos); + NextPtr->setDebugLoc(DebugLoc); + return NextPtr; + } + + // Helper function to encapsulate index calculation + virtual unsigned getItemIndex(unsigned Item) { + return Item; + } + + LLVMContext &C; + const unsigned OtherCandidatesNumOf; + }; + + struct FalseMarker : public TrueMarker { + using base = TrueMarker; + + FalseMarker(LLVMContext &C, unsigned OtherCandidatesNumOf) : base(C, OtherCandidatesNumOf) {} + + bool mayPullFromFreeList() override { + return false; + } + + int computeEliminationCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + // assume 50% alloca (round up) + computeAllocaCost(ListTy, Factor, TTI, DL) + + Factor * TTI.getMemoryOpCost( + Instruction::Store, + // Pointer to next item + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()) + + // Store parameters for the call itself + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(0)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(0)), + DL.getAllocaAddrSpace()); + } + + // Compute Cost for Transformation + int computeReturnPathCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + // One Conditional Branch for check return per iteration + Factor * OtherCandidatesNumOf * TTI.getCFInstrCost(Instruction::Br) + + // Cost for loading the next element per iteration + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getPointerTo(), + DL.getABITypeAlignment(ListTy), + DL.getAllocaAddrSpace()); + + } + + // Create the return path for the FalseMarker + // + // Generated Code: + // + // mtce.return: + // if (worklist == nullptr) + // goto mtce.retonly; + // + // mtce.loadnext: + // nextitem = *worklist + // goto loopentry; + // + // mtce.retonly: + // return + // + std::tuple insertReturnPath( + Instruction *ListItem, Instruction *FreeList, BasicBlock *MtceReturnBB, + PHINode *WorkListItemPHI, PHINode *FreeListRecursivePHI, + const DebugLoc &DebugLoc, const DataLayout &DL) override { + + Instruction *InsertPos; + + // Start by creating a basic block that adds an item to the free list + BasicBlock *LoadNextBB = MtceReturnBB->splitBasicBlock(MtceReturnBB->getFirstNonPHI(), + "mtce.loadnext"); + + // Check next item for null and return if necessary + InsertPos = MtceReturnBB->getTerminator(); + ICmpInst *CmpI = new ICmpInst(InsertPos, + ICmpInst::ICMP_EQ, + ListItem, + ConstantPointerNull::get(cast(ListItem->getType())), + "mtce.worklist.is.null.fm"); + CmpI->setDebugLoc(DebugLoc); + + // Create the final return only BB and branch to if indicated by the + // comparison. 
+ BasicBlock *ReturningBB = LoadNextBB->splitBasicBlock(LoadNextBB->getTerminator(), + "mtce.retonly"); + + BranchInst *CRBI = BranchInst::Create(ReturningBB, LoadNextBB, CmpI); + CRBI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(MtceReturnBB->getTerminator(), CRBI); + + BranchInst *LNBI = cast(LoadNextBB->getTerminator()); + BasicBlock *LoopEntry = WorkListItemPHI->getParent(); + LNBI->setSuccessor(0, LoopEntry); + + LoadInst *MLI = new LoadInst(ListItem, "mtce.mload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + LoadNextBB->getTerminator()); + MLI->setDebugLoc(DebugLoc); + WorkListItemPHI->addIncoming(MLI, LoadNextBB); + FreeListRecursivePHI->addIncoming(FreeList, LoadNextBB); + + return std::make_tuple(nullptr, ReturningBB); + } + }; + + struct TaggedMarker : public TrueMarker { + using base = TrueMarker; + + TaggedMarker(LLVMContext &C, unsigned OtherCandidatesNumOf) : base(C, OtherCandidatesNumOf) {} + + int computeAllocaCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + // We factor in the stores to create the inner links + return + base::computeAllocaCost(ListTy, Factor, TTI, DL) + + // Set the the marker bit, except for the first (accounted in + // computeEliminationCost) and last (no bit to set for the marked item) + Factor * (OtherCandidatesNumOf - 2) * TTI.getArithmeticInstrCost( + Instruction::Or, DL.getIntPtrType(ListTy->getPointerTo())); + } + + int computeEliminationCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + base::computeEliminationCost(ListTy, Factor, TTI, DL) + + Factor * TTI.getArithmeticInstrCost( + Instruction::Or, DL.getIntPtrType(ListTy->getPointerTo())); + } + + // Compute Cost for Transformation + int computeReturnPathCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + // One and instruction to check bit 0 per iteration + Factor * OtherCandidatesNumOf * TTI.getArithmeticInstrCost( + Instruction::And, DL.getIntPtrType(ListTy->getPointerTo())) + + // One Conditional Branch for check bit 0 + Factor * OtherCandidatesNumOf * TTI.getCFInstrCost(Instruction::Br) + + // Mask bit 0, once per unmarked item + // One and instruction to check bit 0 per iteration + Factor * (OtherCandidatesNumOf - 1) * TTI.getArithmeticInstrCost( + Instruction::And, DL.getIntPtrType(ListTy->getPointerTo())) + + // Update freelist only once per block + computeFreelistManagementCost(ListTy, Factor, TTI, DL) + + // Check for return only once per block + Factor * TTI.getCFInstrCost(Instruction::Br) + + // Cost for loading the next element per iteration + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getPointerTo(), + DL.getABITypeAlignment(ListTy), + DL.getAllocaAddrSpace()); + + } + + // Tagged marker only works if the list alignment is 2 or higher. Bit 0 is + // used to mark the last item in each block. + bool checkListType(Type *ListTy, const DataLayout &DL) override { + // Check if the last bit in the address is available + return (DL.getABITypeAlignment(ListTy) % 2 == 0); + } + + // Bit 0 in the next pointer in the previous items encodes the marker bit. + // If this is 1 then a further item exist in this allocation block, + // otherwise the last item is reached and the block can be added to the free + // list. 
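+    //
+    // Illustrative example (assumed layout, three items per block): the stored
+    // next pointers would look like
+    //
+    //   head         = tag(&item[0])   ; bit 0 set, item[0] is not the last of its block
+    //   item[0].next = tag(&item[1])   ; bit 0 set, item[1] is not the last of its block
+    //   item[1].next = &item[2]        ; untagged, item[2] is the last item of the block
+    //   item[2].next = previous worklist head
+    //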
+ Instruction *addComputeNextPtr(Value *ItemArray, unsigned ItemIdx, Instruction *InsertPos, + const DebugLoc &DebugLoc, const DataLayout &DL) override { + Instruction *UntaggedNextPtr = base::addComputeNextPtr(ItemArray, ItemIdx, + InsertPos, DebugLoc, DL); + if (ItemIdx == OtherCandidatesNumOf - 1) + return UntaggedNextPtr; + + IntegerType *IntPtrType = cast(DL.getIntPtrType(UntaggedNextPtr->getType())); + Instruction *PtrToInt = new PtrToIntInst(UntaggedNextPtr, + IntPtrType, + "mtce.ptoi." + Twine(ItemIdx), InsertPos); + PtrToInt->setDebugLoc(DebugLoc); + APInt Mark(IntPtrType->getBitWidth(), 1); + Instruction *TaggedIntPtr = BinaryOperator::Create(Instruction::Or, + PtrToInt, ConstantInt::get(C, Mark), + "mtce.listhead.msked.intptr", + InsertPos); + Instruction *NextPtr = new IntToPtrInst(TaggedIntPtr, UntaggedNextPtr->getType(), + "mtce.tagged.next.for." + Twine(ItemIdx), + InsertPos); + NextPtr->setDebugLoc(DebugLoc); + return NextPtr; + } + + // Create the return path for the TaggedMarker. + // + // Generated Code: + // + // mtce.return: + // intptr_t worlist_int = (intptr_t)worklist; + // if ( worklist_int & 1 == 0) + // goto mtce.check_return; + // + // mtce.unmarked: + // nextitem = *(worklist_ty*)(worklist_int & ~1); + // goto loopentry; + // + // mtce.add.freelist: + // nextitem = *worklist; + // ; Freelist management code is inserted here + // goto loopentry; + // + // mtce.retonly: + // return; + // + std::tuple insertReturnPath( + Instruction *ListItem, Instruction *FreeList, BasicBlock *MtceReturnBB, + PHINode *WorkListItemPHI, PHINode *FreeListRecursivePHI, + const DebugLoc &DebugLoc, const DataLayout &DL) override { + LLVMContext &C = MtceReturnBB->getContext(); + Instruction *InsertPos; + BasicBlock *LoopEntry = WorkListItemPHI->getParent(); + + // Start by creating a basic block to check the marker bit + BasicBlock *CheckReturnBB = MtceReturnBB->splitBasicBlock(MtceReturnBB->getFirstNonPHI(), + "mtce.check.return"); + InsertPos = MtceReturnBB->getTerminator(); + IntegerType *IntPtrType = cast(DL.getIntPtrType(ListItem->getType())); + + // Convert the pointer into integer to to perform logical operations + Instruction *PtrToInt = new PtrToIntInst(ListItem, IntPtrType, + "mtce.ptoi", InsertPos); + + PtrToInt->setDebugLoc(DebugLoc); + Instruction *Mark = new TruncInst(PtrToInt, IntegerType::get(C, 1), "mtce.mark", InsertPos); + + // Mask the tag bit and convert it back int pointer type + APInt Mask = IntPtrType->getMask(); + Mask.clearBit(0); + Instruction *MWLIntPtr = BinaryOperator::Create(Instruction::And, + PtrToInt, ConstantInt::get(C, Mask), + "mtce.masked.worklist.intptr", + InsertPos); + MWLIntPtr->setDebugLoc(DebugLoc); + Instruction *MWL = new IntToPtrInst(MWLIntPtr, ListItem->getType(), + "mtce.masked.worklist", + InsertPos); + MWL->setDebugLoc(DebugLoc); + + // Create a branch target for unmarked items, and replace the terminator + // by the conditional version. 
+ BasicBlock *UnmarkedItemBB = BasicBlock::Create(C, "mtce.unmarked", + CheckReturnBB->getParent(), + CheckReturnBB); + BranchInst *MarkBI = BranchInst::Create(UnmarkedItemBB, CheckReturnBB, Mark); + MWL->setDebugLoc(DebugLoc); + ReplaceInstWithInst(MtceReturnBB->getTerminator(), MarkBI); + + // When reaching unmarked items, just load the item and branch into the + // loop entry + LoadInst *UMLI = new LoadInst(MWL, "mtce.umload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + UnmarkedItemBB); + + UMLI->setDebugLoc(DebugLoc); + + BranchInst *UMBI = BranchInst::Create(LoopEntry, UnmarkedItemBB); + UMBI->setDebugLoc(DebugLoc); + + WorkListItemPHI->addIncoming(UMLI, UnmarkedItemBB); + FreeListRecursivePHI->addIncoming(FreeList, UnmarkedItemBB); + + // The code generated above handles the case for marked items. + // Next split if a block for checking the return condition. + // + // Note, the unmasked version of the worklist pointer can be used, since + // we know that the tag bit is zero. + BasicBlock *AddFreeListBB = CheckReturnBB->splitBasicBlock(CheckReturnBB->getFirstNonPHI(), + "mtce.add.freelist"); + InsertPos = CheckReturnBB->getTerminator(); + ICmpInst *CmpI = new ICmpInst(InsertPos, + ICmpInst::ICMP_EQ, + ListItem, + ConstantPointerNull::get(cast(ListItem->getType())), + "mtce.worklist.is.null.tg"); + CmpI->setDebugLoc(DebugLoc); + + // Create the final return only BB and branch to if indicated by the + // comparison. + BasicBlock *ReturningBB = AddFreeListBB->splitBasicBlock(AddFreeListBB->getTerminator(), + "mtce.retonly"); + BranchInst *CRBI = BranchInst::Create(ReturningBB, AddFreeListBB, CmpI); + CRBI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(CheckReturnBB->getTerminator(), CRBI); + + // Before managing the freelist the item has to be loaded in order not to + // change the next pointer of the worklist improperly. The according + // instructions is added here and the branch is updated (including the PHI node) + LoadInst *MLI = new LoadInst(ListItem, "mtce.mload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + AddFreeListBB->getTerminator()); + MLI->setDebugLoc(DebugLoc); + + BranchInst *ABI = cast(AddFreeListBB->getTerminator()); + ABI->setSuccessor(0, LoopEntry); + + WorkListItemPHI->addIncoming(MLI, AddFreeListBB); + + return std::make_tuple(ABI, ReturningBB); + } + }; + + typedef std::forward_list CandidateContainer; + + /// Current DataLayout. + const DataLayout &DL; + + /// Current Context. + LLVMContext &C; + + /// Target Information + TargetTransformInfo &TTI; + + /// Optimization Remark Emitter to use. + OptimizationRemarkEmitter &ORE; + + /// Function the pass is working on. + Function &F; + + /// Alias Analysis of the current function. + AliasAnalysis &AA; + + /// Type of list elements for the selected marker algorithm; + StructType *ListTy; + + /// List of call candidates to be eliminated, this list contains the single + /// first candidate. + CandidateContainer FirstCandidate; + + /// List of the remaining call candidates to be eliminated. + CandidateContainer OtherCandidates; + + /// Number of elements in the OtherCandidates List + unsigned OtherCandidatesNumOf; + + /// List of arguments for the function H. + SmallVector IncomingArguments; + + /// List of movable instructions. + SmallVector MovableInst; + + /// List of static instructions. + SmallVector StaticInst; + + /// Selected Marker Algorithm + std::unique_ptr Marker; + + /// Find all recursive calls within the current basic block and add them to + /// the list of candidates. 
+ /// + /// \returns true if at leas one recursive call is found. + bool appendAllRecursiveCallsInBB(Function &F, + BasicBlock &BB, + unsigned &NumOf, + CandidateContainer &Candidates, + CandidateContainer::iterator &LastCandidate) { + bool found = false; + for (auto &I: BB) { + CallInst *CI = dyn_cast(&I); + if (CI && CI->getCalledFunction() == &F) { + LastCandidate = Candidates.emplace_after(LastCandidate, CI, NumOf); + NumOf++; + found = true; + } + } + return found; + } + + /// Test if the given basic block is a return only BB. + bool isReturnOnlyBB(const BasicBlock &BB) const { + const ReturnInst *RI = dyn_cast(BB.getTerminator()); + return (RI != nullptr && BB.getFirstNonPHIOrDbg() == RI); + } + + /// Return true if it is safe to move the specified + /// instruction from after the call to before the call, assuming that all + /// instructions between the call and this instruction are movable. + /// + /// This is mostly a copy of canMoveAboveCall within TailCallElimination. + /// + /// \returns true if the instruction can be moved. + bool canMoveAbove(Instruction *I, Instruction *SI) { + // FIXME: We can move load/store/call/free instructions above the call if the + // call does not mod/ref the memory location being processed. + if (I->mayHaveSideEffects()) // This also handles volatile loads. + return false; + + if (LoadInst *L = dyn_cast(I)) { + // Loads may always be moved above calls without side effects. + if (SI->mayHaveSideEffects()) { + const DataLayout &DL = L->getModule()->getDataLayout(); + if (isModSet(AA.getModRefInfo(SI, MemoryLocation::get(L))) || + !isSafeToLoadUnconditionally(L->getPointerOperand(), + L->getAlignment(), DL, L)) + return false; + } + } + + // Otherwise, if this is a side-effect free instruction, check to make sure + // that it does not use the return value of the call. If it doesn't use the + // return value of the call, it must only use things that are defined before + // the call, or movable instructions between the call and the instruction + // itself. + return !is_contained(I->operands(), SI); + } + + /// Analyze the current function and return true if it is feasible and + /// profitable to apply the transformation. + /// + /// \returns the number of candidates found. + bool analyzeFunction() { + // Number of basic blocks with all recursive calls + int BBwithCandidates = 0; + bool hasAllocas = false; + + CandidateContainer Candidates; + auto LastCandidate = Candidates.before_begin(); + + // We cannot handle variadic arguments. + if (F.isVarArg()) + return false; + + // TODO (improvement): There might be ways to implement non void returns. For example + // similar to accumulator TREs. + if (F.getReturnType() != Type::getVoidTy(F.getContext())) + return false; + + // No, can't be done. + if (F.callsFunctionThatReturnsTwice()) + return false; + + unsigned CandidatesNumOf = 0; + for (auto &BB: F) { + // EH pads cannot be handled at all + if (BB.isEHPad()) { + return false; + } + + // Alloca instructions are not allowed. + // + // TODO (improvement): Alloca can be theoretically be handled if it can be + // proven that the used memory lifetime is bounded before the first call. + // In this case the alloca can be moved into the new entry block the same + // way tail call elimination does. + for (auto &I: BB) { + if (isa(&I)) { + hasAllocas = true; + } + } + + if (isa(BB.getTerminator())) { + // For returning blocks find all candidates. 
if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates, LastCandidate)) {
+          BBwithCandidates++;
+        }
+      }
+      else if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+        // For non-returning blocks find all candidates, iff the block has a
+        // single successor which is a return-only block.
+        if (BI->isUnconditional() && isReturnOnlyBB(*BI->getSuccessor(0))) {
+          if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates, LastCandidate)) {
+            BBwithCandidates++;
+          }
+        }
+      }
+
+      // Currently only one BB with tail recursive calls is allowed.
+      //
+      // TODO (improvement): There might be interesting cases with recursive
+      // calls in more than one BB.
+      if (BBwithCandidates > 1) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed",
+                                          F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+                 << ore::NV("Reason", "Multiple Basic Blocks");
+        });
+        return false;
+      }
+    }
+
+    // No MTCE required. Leave to TRE.
+    if (CandidatesNumOf < 2) {
+      return false;
+    }
+
+    if (hasAllocas) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed",
+                                        F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+               << ore::NV("Reason", "Functions containing alloca instructions not yet supported");
+      });
+      return false;
+    }
+
+    BasicBlock *BB = Candidates.front().getCallInst()->getParent();
+    Instruction *TI = BB->getTerminator();
+
+    // All instructions between the first candidate and the last candidate are
+    // partitioned into static (unmovable) instructions and movable instructions.
+    // The resulting two arrays are ordered in the required execution sequence.
+    //
+    // This means that the execution order (MovableInst | StaticInst) is legal
+    // and equivalent to the original sequence.
+    //
+    // Algorithm:
+    //
+    // * Iterate over all instructions between Candidate[n] and Candidate[n+1].
+    //
+    // * Test if the instruction can be moved above all identified static
+    //   instructions.
+    //
+    // * If it can, add it to the movable instruction list, otherwise add it to
+    //   the static instruction list.
+    //
+    // * Consecutively add static instructions if necessary.
+
+    // Iterate over all instructions in the interval (Candidate[x], Candidate[x+1]).
+    // Virtually include the terminator as an additional candidate.
+    //
+    // Set up the first iteration interval between Candidate[0] and Candidate[1].
+    // Note: The list has at least two members.
+    auto Cand0 = Candidates.begin();
+    auto Cand1 = next(Cand0);
+    BasicBlock::iterator BBI(Cand0->getCallInst());
+    auto CandI = Cand1;
+
+    // The first candidate is always a single call which is a static instruction.
+    StaticInst.push_back(&*BBI);
+
+    while (true) {
+      // Identify the instruction that marks the end of the current interval.
+      //
+      // It is either CandI's call, or the terminator instruction (which is
+      // virtually added).
+      Instruction *E;
+      if (CandI != Candidates.end()) {
+        E = CandI->getCallInst();
+      }
+      else {
+        E = TI;
+      }
+
+      // Advance the iterator first, because we start with the first call
+      // instruction which is already added.
+      ++BBI;
+
+      // If we reached the terminator we are done.
+      if (&*BBI == TI)
+        break;
+
+      if (&*BBI == E) {
+        StaticInst.push_back(&*BBI);
+        // Advance the current iteration interval to the next candidate.
+        ++CandI;
+        continue;
+      }
+
+      // Test if the instruction can be moved and add it accordingly.
bool movable = true;
+      for (auto &SI: StaticInst) {
+        if (canMoveAbove(&*BBI, &*SI))
+          continue;
+        movable = false;
+        break;
+      }
+
+      if (movable)
+        MovableInst.push_back(&*BBI);
+      else
+        StaticInst.push_back(&*BBI);
+    }
+
+    // Assign all static instructions to the candidates. We do it naively by
+    // just assigning the instructions in between two candidates to the second
+    // one.
+    auto SI = StaticInst.begin();
+    for (auto &Cand: Candidates) {
+      while (*SI != Cand.getCallInst()) {
+        Cand.addInstruction(*SI);
+        SI++;
+      }
+      // Skip candidate's call inst.
+      SI++;
+    }
+
+    // If not all static instructions have been assigned, then there are
+    // unmovable trailing instructions and the transformation is abandoned.
+    if (SI != StaticInst.end()) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", *SI)
+               << ore::NV("Reason", "Instructions after last call.");
+      });
+      return false;
+    }
+
+    // Analyze each call candidate to check whether the call and the
+    // corresponding static instructions are suitable to be eliminated.
+    for (auto &Cand: Candidates) {
+      if (!Cand.analyze()) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand.getCallInst())
+                 << ore::NV("Reason", "RecursiveCall not viable");
+        });
+        return false;
+      }
+    }
+
+    // Test if the functions generated during analysis (the step before) behave
+    // equivalently.
+    for (auto CCI = std::next(Cand1), CCE = Candidates.end();
+         CCI != CCE; ++CCI) {
+
+      // Test if the function H(...) of both candidates matches.
+      if (!CCI->compareFunctionH(&*Cand1)) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", CCI->getCallInst())
+                 << ore::NV("Reason", "RecursiveCall nonviable (Function H does not match)");
+        });
+        return false;
+      }
+    }
+
+    // Analyze the operands for each function H and categorize them into
+    // Stacked, Constant and Static.
+    IncomingArguments.resize(Cand1->getIncomingOperands().size(), IncomingArgument());
+    for (auto CCI = Cand1, CCE = Candidates.end(); CCI != CCE; ++CCI) {
+      assert(IncomingArguments.size() == CCI->getIncomingOperands().size() && "Operand size mismatch");
+
+      auto IAI = IncomingArguments.begin();
+      for (Use *U: CCI->getIncomingOperands()) {
+        IAI->addUse(&*CCI, U);
+
+        // If the value is an Argument we need to check the position in the
+        // first candidate too. It is sufficient here to just execute the checks.
+        if (isa<Argument>(U->get())) {
+          IAI->addUse(&*Cand0, U, true);
+        }
+        ++IAI;
+      }
+    }
+
+    // Count the categories.
+    int NumStaticArguments = 0;
+    int NumConstantArguments = 0;
+    for (auto &IA: IncomingArguments) {
+      if (IA.isConstant())
+        NumConstantArguments++;
+      if (IA.isStatic())
+        NumStaticArguments++;
+    }
+
+    // If we do not spare at least two arguments from the list, no improvement
+    // is expected, so the analysis bails with a negative result.
+ // TODO: This might be too defensive as further checks are added below + if (IncomingArguments.size() - NumStaticArguments - NumConstantArguments + 2 > F.arg_size()) { + LLVM_DEBUG(dbgs() << "Will not apply - no argument reduction.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand0->getCallInst()) + << ore::NV("Reason", "No argument reduction."); + }); + return false; + } + + OtherCandidatesNumOf = CandidatesNumOf - 1; + + // Select a suitable marker algorithm + if (DisableFreeList) { + Marker.reset(new FalseMarker(C, OtherCandidatesNumOf)); + } + else { + if (OtherCandidatesNumOf == 1) { + Marker.reset(new TrueMarker(C, OtherCandidatesNumOf)); + } + else { + Marker.reset(new TaggedMarker(C, OtherCandidatesNumOf)); + } + } + + ListTy = createListType(); + + // Check if the marker is compatible with the ListTy. e.g.: TaggedMarker + // require a list alignment of 2 or multiples of 2. + if (!Marker->checkListType(ListTy, DL)) { + LLVM_DEBUG(dbgs() << "Will not apply - ListTy incompatible.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand0->getCallInst()) + << ore::NV("Reason", "Invalid ListTy."); + }); + return false; + } + + // Just a factor to avoid rounding errors. + int Factor = 2; + + int OldCost = 0; + for (auto &Cand: Candidates) + OldCost += TTI.getInstructionCost(Cand.getCallInst(), TargetTransformInfo::TCK_Latency); + OldCost *= Factor; + + int NewCost = Marker->computeTransformCost(ListTy, Factor, TTI, DL); + + if (NewCost >= OldCost) { + LLVM_DEBUG(dbgs() << "Will not apply - too costly\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand0->getCallInst()) + << ore::NV("Reason", "Too costly") + << ore::NV("OldCost", OldCost) + << ore::NV("NewCost", NewCost); + }); + return false; + } + + // Splice the list into the two output lists. One contains the first element + // only, the other containing the remaining elements. + OtherCandidates.splice_after(OtherCandidates.before_begin(), Candidates, + Candidates.begin(), Candidates.end()); + FirstCandidate.splice_after(FirstCandidate.before_begin(), Candidates); + + return true; + } + + /// Creates the worklist structure used to model the pending call list. + StructType *createListType() const { + // Collect the types of the operands + SmallVector ListElements; + for (auto &IA: IncomingArguments) { + if (IA.isStacked()) + ListElements.push_back(IA.getType()); + } + + // Create a new StructType representing the list. + StructType *ListTy = StructType::create(C, "mtce.listty"); + StructType *OperandsTy = StructType::get(C, makeArrayRef(ListElements), false); + + SmallVector LinkItems; + LinkItems.push_back(ListTy->getPointerTo()); + Marker->adjustStructFields(LinkItems); + StructType *LinkTy = StructType::get(C, makeArrayRef(LinkItems), false); + + Type *ListItemTys[2] = { OperandsTy, LinkTy }; + ListTy->setBody(makeArrayRef(ListItemTys, 2), false); + + return ListTy; + } + + /// Prepare the entry block by splitting of the loop entry. The following + /// PHINodes are added to the loop entry. + /// + /// * One for each argument (Also the uses are adjusted). + /// * List head representing the root of the worklist. + /// * If enabled the root to the free list. 
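+  ///
+  /// For example, for a function `void f(i32 %a1, i32 %a2, i32 %a3)` (as in
+  /// the tests below) the loop entry ends up roughly as:
+  ///
+  ///   mtce.loop:
+  ///     %mtce.listhead = phi %mtce.listty* [ null, %entry ], ...
+  ///     %mtce.freelist = phi %mtce.listty* [ null, %entry ], ...
+  ///     %mtce.a1 = phi i32 [ %a1, %entry ], ...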
+  template <typename ArgumentPHIContainer>
+  std::tuple<PHINode *, PHINode *, BasicBlock *>
+  prepareEntry(ArgumentPHIContainer &ArgumentPHIs) {
+    const DebugLoc &DebugLoc = FirstCandidate.front().getCallInst()->getDebugLoc();
+
+    // Split the entry block at the first viable instruction.
+    BasicBlock *Entry = &F.getEntryBlock();
+    BasicBlock *LoopEntry = Entry->splitBasicBlock(Entry->getFirstNonPHI(), "mtce.loop");
+    Entry->getTerminator()->setDebugLoc(DebugLoc);
+
+    // Setup list PHI nodes for the loop entry.
+    Instruction *InsertPos = &LoopEntry->front();
+    PHINode *ListHead = PHINode::Create(ListTy->getPointerTo(), 2,
+                                        "mtce.listhead",
+                                        InsertPos);
+    ListHead->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()), Entry);
+
+    // Setup the FreeList if necessary.
+    PHINode *FreeList = nullptr;
+    FreeList = PHINode::Create(ListTy->getPointerTo(), 2,
+                               "mtce.freelist",
+                               InsertPos);
+    FreeList->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()), Entry);
+
+    // Setup Argument PHI nodes for the loop entry - see TRE.
+    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(), 2,
+                                    "mtce." + I->getName(), InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+      PN->addIncoming(&*I, Entry);
+      ArgumentPHIs.push_back(PN);
+    }
+
+    return std::make_tuple(ListHead, FreeList, LoopEntry);
+  }
+
+  /// Adjust all ArgumentPHIs according to the CallInst to be eliminated.
+  ///
+  /// Arguments are taken from CI. IncomingBB is used as the BB to add to the
+  /// PHI list.
+  template <typename ArgumentPHIContainer>
+  void adjustPHIsFromCallInst(
+      BasicBlock *IncomingBB, CallInst *CI,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+    // Adjust PHI nodes in the loop entry.
+    CallInst::op_iterator Arg = CI->arg_begin();
+    for (auto &PHI: ArgumentPHIs) {
+      assert(Arg != CI->arg_end() && "Callsite does not provide enough arguments.");
+      PHI->addIncoming(Arg->get(), IncomingBB);
+      Arg++;
+    }
+    assert(Arg == CI->arg_end() && "Callsite provided too many arguments.");
+  }
+
+  /// Eliminates the first call just like it is done within tail call
+  /// elimination. Additionally it moves all movable instructions before the
+  /// first call candidate, and it splits the remaining calls into a separate
+  /// basic block. ArgumentPHIs are adjusted accordingly.
+  //
+  // calling.bb:
+  //   ...
+  //   call @recursive ...
+  //
+  //   %movable1 = add ...
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %movable2 = add ...
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  //
+  // ->
+  //
+  // calling.bb:
+  //   ...
+  //   %movable1 = add ...
+  //   %movable2 = add ...
+  //   br %loopentry
+  //
+  // H1:
+  //
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  template <typename ArgumentPHIContainer>
+  std::tuple<BasicBlock *, BasicBlock *>
+  eliminateFirstCall(RecursiveCall &CC,
+                     BasicBlock *LoopEntry,
+                     const ArgumentPHIContainer &ArgumentPHIs) {
+    CallInst *CI = CC.getCallInst();
+
+    // Move all movable instructions before this call instruction.
+    for (auto &I: MovableInst)
+      I->moveBefore(CI);
+
+    // Split off the remaining static (unmovable) instructions.
+    BasicBlock *CallerBB = CI->getParent();
+    // For now MtceReturnBB still contains all static instructions. These will
+    // be removed and this BB becomes the new return block.
+    BasicBlock *MtceReturnBB = CallerBB->splitBasicBlock(CI, "mtce.return");
+    BranchInst *BI = cast<BranchInst>(CallerBB->getTerminator());
+
+    // Create the loop to eliminate the call to the first candidate.
+ BI->setSuccessor(0, LoopEntry); + BI->setDebugLoc(CI->getDebugLoc()); + + adjustPHIsFromCallInst(CallerBB, CI, ArgumentPHIs); + CC.eraseSourceCallInst(); + + return std::make_tuple(CallerBB, MtceReturnBB); + } + + /// Adds code to bypass the alloca (inserting by eliminateOtherCalls) with + /// code to picking it from the freelist. + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // + // ; allocate memory on stack + // %itemptr = alloca %listty, 2 + // + // ; compute address if list items + // %gep.0 = getelementptr %listty, 0 + // ... + // %gep.k = getelementptr %listty, k + // + // ; setup and store list item [0]; set mark to false + // mark.0 = + // %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0 + // store %gep.1, *nextptr.0 + // ... + // mark.k = + // %nextptr.k = getelementptr %listty, itemptr, k, 1, 0 + // store %listhead, %nextptr.k + // + // ; setup and store operands entries + // %opreands.0 = getelementptr %listty, itemptr, 0, 0 + // %arg.0-0 = insertvalue mark_0, undef, %movable.0-0 + // store %arg.0-0, %operands.0 + // ... + // + // -> [AI = %itemptr, AllocaEnd = %nextptr.k, NewHeadPtr = %gep.0] + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // + // alloca: + // ; allocate memory on stack + // %itemptr = alloca %listty, 2 + // + // ; compute address if list items + // %gep.0 = getelementptr %listty, 0 + // ... + // %gep.k = getelementptr %listty, k + // + // ; setup and store list item [0]; set mark to false + // mark.0 = + // %nextptr.0 = getelementptr %listty, %itemptr, 0, 1, 0 + // store %gep.1, *nextptr.0 + // ... + // mark.k = + // + // allocfromfree: + // %gep.0.fromfree = getelementptr %listty, %freelist, ... + // %nextptr = getelementptr %listty, 0, #STRUCT_IDX_OFNEXT + // %next = load %nextptr + // br calling.bb.1 + // + // calling.bb.1: + // %newheadptrphi = phi %listty, [%gep.0, %allocaa], [%gep.0.fromfree.fromfree, %allocfromfree] + // %allocated = phi %listty, [%itemptr, %alloca], [%freelist, %allocfromfree] + // %newfreelist = phi %listty, [%freelist, %alloca], [%next, %allocfromfree] + // + // %nextptr.k = getelementptr %listty, %allocated, k, 1, 0 + // store %listhead, %nextptr.k + // + // ; setup and store operands entries + // %opreands.0 = getelementptr %listty, %allocated, 0, 0 + // %arg.0-0 = insertvalue mark_0, undef, %movable.0-0 + // store %arg.0-0, %operands.0 + // ... + void replaceAllocaWithFreeList(AllocaInst *AI, Instruction *AllocaEnd, + Instruction *NewHeadPtr, PHINode *FreeList) { + // Never generate code to pull from free list if Marker does not require it. + if (!Marker->mayPullFromFreeList()) { + FreeList->addIncoming(FreeList, AI->getParent()); + return; + } + + const DebugLoc &DebugLoc = AI->getDebugLoc(); + BasicBlock *PreCallingBB = AI->getParent(); + BasicBlock::iterator InsertPos(AI); + + // Split and create BBs. 
+ BasicBlock *AllocaBB = PreCallingBB->splitBasicBlock(InsertPos, "mtce.alloca"); + InsertPos = BasicBlock::iterator(AllocaEnd); + BasicBlock *CallingBB = AllocaBB->splitBasicBlock(InsertPos, PreCallingBB->getName() + ".1"); + BasicBlock *PullBB = BasicBlock::Create(C, "mtce.allocfromfree", &F, CallingBB); + + // Compute address + Value *IdxNext[] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(0)), + ConstantInt::get(IntegerType::get(C, 32), 1), + ConstantInt::get(IntegerType::get(C, 32), 0), + }; + Instruction *PulledItem0 = GetElementPtrInst::CreateInBounds(ListTy, + FreeList, + makeArrayRef(IdxNext, 1), + "mtce.pulleditemgep.0", + PullBB); + PulledItem0->setDebugLoc(DebugLoc); + + IdxNext[0] = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(OtherCandidatesNumOf - 1)); + Instruction *NewFreePtr = GetElementPtrInst::CreateInBounds(ListTy, + FreeList, + makeArrayRef(IdxNext, 3), + "mtce.freelist.nextptr", + PullBB); + NewFreePtr->setDebugLoc(DebugLoc); + + // Load the next free item address + LoadInst *NewFree = new LoadInst(NewFreePtr, + "mtce.newfreelist", + false, + DL.getABITypeAlignment(ListTy->getPointerTo()), + PullBB); + NewFree->setDebugLoc(DebugLoc); + + // Create branches + BranchInst *BI = BranchInst::Create(CallingBB, PullBB); + BI->setDebugLoc(DebugLoc); + + InsertPos = BasicBlock::iterator(CallingBB->getFirstNonPHI()); + PHINode *AllocatedPHI = PHINode::Create(ListTy->getPointerTo(), 2, + "mtce.allocated", + &*InsertPos); + AllocatedPHI->setDebugLoc(DebugLoc); + + // Replace all remaining uses of AI with the new PHInode + for (auto i = AI->use_begin(), e = AI->use_end(); i != e; /* in loop */) { + Use &U = *(i++); + if (Instruction *I = dyn_cast(U.getUser())) { + if (I->getParent() == CallingBB) + U.set(AllocatedPHI); + } + } + + AllocatedPHI->addIncoming(AI, AllocaBB); + AllocatedPHI->addIncoming(FreeList, PullBB); + + PHINode *UpdatedFreeList = PHINode::Create(ListTy->getPointerTo(), 2, + "mtce.freelist.next", + &*InsertPos); + UpdatedFreeList->setDebugLoc(DebugLoc); + UpdatedFreeList->addIncoming(FreeList, AllocaBB); + UpdatedFreeList->addIncoming(NewFree, PullBB); + FreeList->addIncoming(UpdatedFreeList, UpdatedFreeList->getParent()); + + InsertPos = BasicBlock::iterator(PreCallingBB->getTerminator()); + ICmpInst *CmpI = new ICmpInst(&*InsertPos, + ICmpInst::ICMP_EQ, + FreeList, + ConstantPointerNull::get(ListTy->getPointerTo()), + "mtce.cmp_freelist_empty"); + CmpI->setDebugLoc(DebugLoc); + + BI = BranchInst::Create(AllocaBB, PullBB, CmpI); + BI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(PreCallingBB->getTerminator(), BI); + + return; + } + + /// Eliminate other recursive calls by adding the structures to the itemlist. + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // br %loopentry + // + // -> + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // + // ; allocate memory on stack + // %itemptr = alloca %listty, 2 + // + // ; compute address if list items + // %gep_0 = getelementptr %listty, 0 + // ... + // %gep_k = getelementptr %listty, k + // + // ; setup and store list item [0]; set mark to false + // mark.0 = + // %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0 + // store %gep.1, *nextptr.0 + // ... 
+ // mark.k = + // %nextptr.k = getelementptr %listty, itemptr, k, 1, 0 + // store %listhead, %nextptr.k + // + // ; setup and store operands entries + // %opreands.0 = getelementptr %listty, itemptr, 0, 0 + // %arg.0-0 = insertvalue mark_0, undef, %movable.0-0 + // store %arg.0-0, %operands.0 + // ... + // %opreands.k = getelementptr %listty, itemptr, k, 0 + // %arg.k-0 = insertvalue mark_1, undef, %movable.k-0 + // store %arg.k-0, %operands.k + // + // br %loopentry + // + // ; below a reference to the source instruction + // + // H1: + // + // %static.0 = call @fn1, %movable.0 + // call @recursive ... + // + // %static.k = call @fn1, %movable.k + // call @recursive ... + // + // br ret + void eliminateOtherCalls(CandidateContainer &OtherCandidates, + BasicBlock *CallerBB, + PHINode *ListHead, + PHINode *FreeList) { + Instruction *InsertPos = CallerBB->getTerminator(); + Type *OperandsTy = ListTy->getTypeAtIndex(0U); + + // Add an entry to the list for all candidates except the first. This to + // eliminates the calls to those candidates. + AllocaInst *AllocaPtr = new AllocaInst(ListTy, + DL.getAllocaAddrSpace(), + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), OtherCandidatesNumOf), + DL.getABITypeAlignment(ListTy), + "mtce.itemptr", + InsertPos); + AllocaPtr->setDebugLoc(InsertPos->getDebugLoc()); + + // Fill list link structure with the necessary values. With a freelist + // available all values except the last nextptr are constant during this + // execution instance. The address values are all relative to the allocated + // address, and the last mark (if any) also do not need to change when + // pulling the element from the freelist. AllocaEnd marks the first + // instruction that must be executed when pulling from the freelist. + Instruction *AllocaEnd = nullptr; + unsigned idx = 0; + for (auto &Cand: OtherCandidates) { + const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc(); + + Value *Idx[3] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(idx)), + ConstantInt::get(IntegerType::get(C, 32), 1), + ConstantInt::get(IntegerType::get(C, 32), 0) }; + + Instruction *LinkPtrGEP = GetElementPtrInst::CreateInBounds(ListTy, + AllocaPtr, + makeArrayRef(Idx, 2), + "mtce.linkptr." + Twine(idx), + InsertPos); + LinkPtrGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc()); + + Instruction *NextPtrGEP = GetElementPtrInst::CreateInBounds(ListTy, + AllocaPtr, + makeArrayRef(Idx, 3), + "mtce.nextptr." + Twine(idx), + InsertPos); + NextPtrGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc()); + + Value *NextVal; + if (idx < OtherCandidatesNumOf - 1) { + NextVal = Marker->addComputeNextPtr(AllocaPtr, idx + 1, InsertPos, + Cand.getCallInst()->getDebugLoc(), DL); + } + else { + NextVal = ListHead; + AllocaEnd = NextPtrGEP; + } + + StoreInst *SNPI = new StoreInst(NextVal, + NextPtrGEP, + false, + DL.getABITypeAlignment(NextVal->getType()), + InsertPos); + SNPI->setDebugLoc(DebugLoc); + + idx++; + } + + // Fill the operands part of the ListItems + idx = 0; + for (auto &Cand: OtherCandidates) { + const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc(); + + unsigned ValueIdx = 0; + Instruction *OperandsVal = nullptr; + for (auto &IA: IncomingArguments) { + if (!IA.isStacked()) + continue; + + Value *ValueToStore = IA.getValue(&Cand); + + OperandsVal = InsertValueInst::Create((OperandsVal ? cast(OperandsVal) : cast(UndefValue::get(OperandsTy))), + ValueToStore, + makeArrayRef(&ValueIdx, 1), + "mtce.arg." 
+ Twine(idx) + ".Op" + Twine(ValueIdx), + InsertPos); + OperandsVal->setDebugLoc(DebugLoc); + ValueIdx++; + } + + // OperandsVal might be null if no operands have to be stored in the list. + if (OperandsVal) { + // Create the GEP that points to the operand struct + Value *Idx[2] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(idx)), + ConstantInt::get(IntegerType::get(C, 32), 0) }; + Instruction *OperandsGEP = GetElementPtrInst::CreateInBounds(ListTy, + AllocaPtr, + makeArrayRef(Idx, 2), + "mtce.operandsgep." + Twine(idx), + InsertPos); + OperandsGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc()); + + StoreInst *SOVI = new StoreInst(OperandsVal, + OperandsGEP, + false, + DL.getABITypeAlignment(OperandsTy), + InsertPos); + SOVI->setDebugLoc(DebugLoc); + } + + idx++; + } + + Instruction *NewHead = Marker->addComputeNextPtr(AllocaPtr, 0, InsertPos, + OtherCandidates.front().getCallInst()->getDebugLoc(), DL); + // Redirect list head by, adjusting listhead PHI Node in the loop entry. + ListHead->addIncoming(NewHead, + NewHead->getParent()); + + replaceAllocaWithFreeList(AllocaPtr, AllocaEnd, nullptr, FreeList); + } + + BasicBlock *insertReturnPath( + BasicBlock *MtceReturnBB, Instruction *ListHead, + Instruction *FreeList, PHINode *WorkListItemPHI, + PHINode *FreeListRecursivePHI) { + // If the block is a non-returning block, then the following block is the a + // return only block and the branch is an unconditional one. In this case + // the branch instruction is replaced by a clone of the return instruction + // of the successor BB. + if (BranchInst *BI = dyn_cast(MtceReturnBB->getTerminator())) { + BasicBlock *RetOnlyBB = MtceReturnBB->getSingleSuccessor(); + assert(RetOnlyBB && "Recursion has more than one successor"); + Instruction *NewTI = RetOnlyBB->getTerminator()->clone(); + ReplaceInstWithInst(BI, NewTI); + } + + const DebugLoc &DebugLoc = MtceReturnBB->getTerminator()->getDebugLoc(); + + Instruction *InsertPos; + BasicBlock *ReturningBB; + std::tie(InsertPos, ReturningBB) = Marker->insertReturnPath( + ListHead, FreeList, MtceReturnBB, WorkListItemPHI, + FreeListRecursivePHI, DebugLoc, F.getParent()->getDataLayout()); + + // If a basic block for freelist management was added, add the corresponding + // Instructions. + if (InsertPos) { + // Compute address of bulk allocation. + signed IdxOfFirstItem = -(signed)Marker->getItemIndex(OtherCandidatesNumOf - 1); + Value *IdxBulk[] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), IdxOfFirstItem), + }; + + Instruction *FreeListAdded = GetElementPtrInst::CreateInBounds( + ListTy, ListHead, makeArrayRef(IdxBulk, 1), "mtce.freelist.added", InsertPos); + FreeListAdded->setDebugLoc(DebugLoc); + + // Adjust Freelist PHI + FreeListRecursivePHI->addIncoming(FreeListAdded, InsertPos->getParent()); + + // Store the old list head into the last element of the item. This is done + // after potentially splitting of the conditional branch, to avoid messing + // up with `NewFreeHead->replaceAllUsesWith(FreeNextPHI);`, we want to use + // this the original value here. 
+
+      // Compute the address of the last item in the block.
+      Value *IdxNext[] = {
+        ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                         Marker->getItemIndex(OtherCandidatesNumOf - 1)),
+        ConstantInt::get(IntegerType::get(C, 32), 1),
+        ConstantInt::get(IntegerType::get(C, 32), 0),
+      };
+      Instruction *FreeNext = GetElementPtrInst::CreateInBounds(ListTy,
+                                                                FreeListAdded,
+                                                                makeArrayRef(IdxNext, 3),
+                                                                "mtce.freenextptr",
+                                                                &*InsertPos);
+      FreeNext->setDebugLoc(DebugLoc);
+
+      StoreInst *SI = new StoreInst(FreeList,
+                                    FreeNext,
+                                    false,
+                                    DL.getABITypeAlignment(ListTy->getPointerTo()),
+                                    &*InsertPos);
+      SI->setDebugLoc(DebugLoc);
+    }
+
+    return ReturningBB;
+  }
+
+  template <typename ArgumentPHIContainer>
+  std::tuple<PHINode *, PHINode *> createRecursiveLoopEntry(
+      BasicBlock *LoopEntry, CandidateContainer &OtherCandidates,
+      PHINode *ListHead, PHINode *FreeList,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+    RecursiveCall &CC = OtherCandidates.front();
+    const DebugLoc &DebugLoc = CC.getCallInst()->getDebugLoc();
+
+    // Create the basic block for the loop entry, used when executing an item
+    // from the worklist.
+    BasicBlock *EntryBB = BasicBlock::Create(C, "mtce.loop.from.worklist", &F, LoopEntry);
+
+    // Create a worklist item as PHINode that contains the referenced list item.
+    PHINode *WLItem = PHINode::Create(ListTy, 1, "mtce.worklist.item", EntryBB);
+    WLItem->setDebugLoc(DebugLoc);
+    PHINode *FreeListNext = PHINode::Create(ListTy->getPointerTo(), 1, "mtce.freelist.next", EntryBB);
+    FreeListNext->setDebugLoc(DebugLoc);
+
+    unsigned Idx[2] = { 1, 0 };
+    Instruction *WLNext = ExtractValueInst::Create(
+        WLItem, makeArrayRef(Idx, 2), "mtce.wlnext", EntryBB);
+
+    unsigned IncomingCount = IncomingArguments.size();
+    SmallVector<Value *, 8> Incoming;
+    Incoming.reserve(IncomingCount);
+    Idx[0] = 0;
+    for (IncomingArgument &IA : IncomingArguments) {
+      Value *V = IA.getValue();
+      if (IA.isConstant() || IA.isStatic()) {
+        Incoming.push_back(V);
+      }
+      else {
+        Instruction *EVI = ExtractValueInst::Create(
+            WLItem, makeArrayRef(Idx, 2), "mtce.op." + Twine(Idx[1]), EntryBB);
+        EVI->setDebugLoc(DebugLoc);
+        Incoming.push_back(EVI);
+        Idx[1]++;
+      }
+    }
+
+    // Clone instructions from function H back into the code sequence.
+    Function *FunctionH = CC.getFunctionH();
+    assert(FunctionH && "FunctionH not present.");
+
+    CallInst *CI = nullptr;
+    std::map<Value *, Value *> VVMap;
+    for (Instruction &I: FunctionH->getEntryBlock()) {
+      if (&I == FunctionH->getEntryBlock().getTerminator())
+        continue;
+
+      Instruction *NewI = I.clone();
+      VVMap[&I] = NewI;
+      EntryBB->getInstList().push_back(NewI);
+      CI = dyn_cast<CallInst>(NewI);
+
+      for (Use &U: NewI->operands()) {
+        Value *V = U.get();
+
+        auto VVMI = VVMap.find(V);
+
+        // For Arguments, replace the operand with the corresponding item in Incoming.
+        if (Argument *A = dyn_cast<Argument>(V)) {
+          U.set(Incoming[A->getArgNo()]);
+        }
+        // For values within the value-to-value map use VVMap to resolve the relation.
+        else if (VVMI != VVMap.end()) {
+          U.set(VVMI->second);
+        }
+        // Otherwise it must be a constant that can be left unchanged.
+        else {
+          assert(isa<Constant>(V) && "could not transform. Not a constant");
+        }
+      }
+    }
+    assert(CI && CI->getCalledFunction() == &F && "Last instruction in FunctionH was not a recursive CallInst");
+
+    // Branch into the loop.
+    BranchInst *BI = BranchInst::Create(LoopEntry, EntryBB);
+    BI->setDebugLoc(DebugLoc);
+
+    // Adjust the list head PHI node in the loop entry.
+    ListHead->addIncoming(WLNext, EntryBB);
+
+    // Adjust the free list PHI node in the loop entry.
+    FreeList->addIncoming(FreeListNext, EntryBB);
+
+    // Adjust Argument PHI nodes in the loop entry.
+    adjustPHIsFromCallInst(EntryBB, CI, ArgumentPHIs);
+
+    // Finally erase the CallInst.
+    CI->eraseFromParent();
+
+    // Remove the call instructions that got eliminated.
+    for (auto &Cand: OtherCandidates) {
+      Cand.eraseSourceInstructions();
+    }
+
+    return std::make_tuple(WLItem, FreeListNext);
+  }
+
+  /// Redirect all returning blocks into the new return block.
+  void redirectRemainingReturnBlocks(BasicBlock *PullBB, BasicBlock *NewReturnBB) {
+    for (auto &BB: F) {
+      if (&BB == NewReturnBB)
+        continue;
+
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        BranchInst *BI = BranchInst::Create(PullBB, RI);
+        BI->setDebugLoc(RI->getDebugLoc());
+        RI->eraseFromParent();
+      }
+    }
+  }
+
+  /// Call the transformation functions above to apply the transformation.
+  void eliminateTailCalls() {
+    SmallVector<PHINode *, 8> ArgumentPHIs;
+    PHINode *ListHead;
+    PHINode *FreeList;
+    BasicBlock *LoopEntry;
+
+    std::tie(ListHead, FreeList, LoopEntry) = prepareEntry(ArgumentPHIs);
+
+    BasicBlock *CallerBB, *MtceReturnBB;
+    std::tie(CallerBB, MtceReturnBB) =
+        eliminateFirstCall(FirstCandidate.front(), LoopEntry, ArgumentPHIs);
+
+    eliminateOtherCalls(
+        OtherCandidates,
+        CallerBB,
+        ListHead,
+        FreeList);
+
+    // Create a loop entry used when executing an item from the worklist. This
+    // function will also remove the remaining static instructions.
+    PHINode *WorkListItemPHI;
+    PHINode *FreeListRecursivePHI;
+    std::tie(WorkListItemPHI, FreeListRecursivePHI) =
+        createRecursiveLoopEntry(LoopEntry, OtherCandidates,
+                                 ListHead, FreeList,
+                                 ArgumentPHIs);
+
+    BasicBlock *ReturningBB = insertReturnPath(MtceReturnBB, ListHead, FreeList, WorkListItemPHI, FreeListRecursivePHI);
+
+    redirectRemainingReturnBlocks(MtceReturnBB, ReturningBB);
+
+    // What is left is to simplify the added PHINodes. As we do not use
+    // ArgumentPHIs anymore, the list is reused for the simplification loop
+    // below.
+    ArgumentPHIs.push_back(WorkListItemPHI);
+    ArgumentPHIs.push_back(FreeListRecursivePHI);
+    ArgumentPHIs.push_back(ListHead);
+    ArgumentPHIs.push_back(FreeList);
+
+    for (PHINode *PN : ArgumentPHIs) {
+      // If the PHI node is a dynamic constant, replace it with the value it is.
+      if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "Multiple tail calls eliminated",
+                                F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+             << "Calls modeled with: " << ore::NV("Width", OtherCandidatesNumOf + 1);
+    });
+  }
+};
+
+/// Wrapper class for legacy pass manager.
+struct MultiTailCallElimination : public FunctionPass {
+  static char ID;
+  MultiTailCallElimination() : FunctionPass(ID) {
+    initializeMultiTailCallEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (DisableMultiTailCallElimination)
+      return false;
+
+    if (skipFunction(F))
+      return false;
+
+    return MultiTailCallEliminationImpl(
+        F,
+        getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+        getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()).run();
+  }
+};
+}
+
+char MultiTailCallElimination::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MultiTailCallElimination, DEBUG_TYPE, "Multi Tail Call Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(MultiTailCallElimination, DEBUG_TYPE, "Multi Tail Call Elimination",
+                    false, false)
+
+/// Public interface to the MultiTailCallElimination pass.
+FunctionPass *llvm::createMultiTailCallEliminationPass() {
+  return new MultiTailCallElimination();
+}
+
+/// Wrapper for the new pass manager.
+PreservedAnalyses MultiTailCallEliminationPass::run(Function &F,
+                                                    FunctionAnalysisManager &AM) {
+  if (DisableMultiTailCallElimination)
+    return PreservedAnalyses::all();
+
+  bool Changed = MultiTailCallEliminationImpl(
+      F,
+      AM.getResult<AAManager>(F),
+      AM.getResult<TargetIRAnalysis>(F),
+      AM.getResult<OptimizationRemarkEmitterAnalysis>(F)).run();
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<GlobalsAA>();
+
+  return PA;
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -82,6 +82,7 @@
   initializeMemCpyOptLegacyPassPass(Registry);
   initializeMergeICmpsPass(Registry);
   initializeMergedLoadStoreMotionLegacyPassPass(Registry);
+  initializeMultiTailCallEliminationPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
@@ -237,6 +238,10 @@
   unwrap(PM)->add(createTailCallEliminationPass());
 }
 
+void LLVMAddMultiTailCallEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMultiTailCallEliminationPass());
+}
+
 void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createConstantPropagationPass());
 }
Index: test/Transforms/MultiTailCallElimination/MultiTailCallElimination.ll
===================================================================
--- /dev/null
+++ test/Transforms/MultiTailCallElimination/MultiTailCallElimination.ll
@@ -0,0 +1,374 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -multi-tce -S | FileCheck %s
+
+define void @simple2(i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: @simple2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MTCE_LOOP:%.*]]
+; CHECK: mtce.loop.from.worklist:
+; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY:%.*]] %mtce.mload, 1, 0
+; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY]] %mtce.mload, 0, 0
+; CHECK-NEXT: br label [[MTCE_LOOP]]
+; CHECK: mtce.loop:
+; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty* [ null, [[ENTRY:%.*]] ], [ [[MTCE_NEXTPTR_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ]
+; CHECK-NEXT: [[MTCE_FREELIST:%.*]] = phi %mtce.listty* [ null,
[[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_A1:%.*]] = phi i32 [ [[A1:%.*]], [[ENTRY]] ], [ 1, [[MTCE_LOOP_1]] ], [ [[MTCE_OP_0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[MTCE_A1]], [[A2:%.*]] +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label [[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY]], i64 1, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_FREELIST]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty*, %mtce.listty** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ALLOCATED]], i64 0, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty* [[MTCE_LISTHEAD]], %mtce.listty** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { i32 } undef, i32 [[TMP0]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_0_OP0]], { i32 }* [[MTCE_OPERANDSGEP_0]], align 4 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TM:%.*]] = icmp eq %mtce.listty* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TM]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST:%.*]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD:%.*]] = load [[MTCE_LISTTY]], %mtce.listty* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_LISTHEAD]], i64 0 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_FREELIST_ADDED]], i64 0, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty* [[MTCE_FREELIST]], %mtce.listty** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + %0 = add i32 %a1, %a2 + call void @simple2(i32 1, i32 %a2, i32 %a3) + call void @simple2(i32 %0, i32 %a2, i32 %a3) + ret void +} + +define void @simple4_with_const(i32 %a1, i32 %a2, i32 %a3) { +; CHECK-LABEL: @simple4_with_const( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MTCE_LOOP:%.*]] +; CHECK: mtce.loop.from.worklist: +; CHECK-NEXT: [[MTCE_WORKLIST_ITEM:%.*]] = phi [[MTCE_LISTTY_0:%.*]] [ [[MTCE_UMLOAD:%.*]], [[MTCE_UNMARKED:%.*]] ], [ [[MTCE_MLOAD:%.*]], 
[[MTCE_ADD_FREELIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT2:%.*]] = phi %mtce.listty.0* [ [[MTCE_FREELIST:%.*]], [[MTCE_UNMARKED]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_ADD_FREELIST]] ] +; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY_0]] %mtce.worklist.item, 1, 0 +; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY_0]] %mtce.worklist.item, 0, 0 +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.loop: +; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty.0* [ null, [[ENTRY:%.*]] ], [ [[MTCE_TAGGED_NEXT_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST]] = phi %mtce.listty.0* [ null, [[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_NEXT2]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_A1:%.*]] = phi i32 [ [[A1:%.*]], [[ENTRY]] ], [ 1, [[MTCE_LOOP_1]] ], [ [[MTCE_OP_0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty.0* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label [[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY_0]], i64 3, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 1 +; CHECK-NEXT: [[MTCE_PTOI_1:%.*]] = ptrtoint %mtce.listty.0* [[MTCE_NEXTPTR_FOR_1]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR:%.*]] = or i64 [[MTCE_PTOI_1]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_1:%.*]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR]] to %mtce.listty.0* +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_TAGGED_NEXT_FOR_1]], %mtce.listty.0** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 1, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 1, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 2 +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_NEXTPTR_FOR_2]], %mtce.listty.0** [[MTCE_NEXTPTR_1]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 2, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_FREELIST]], i64 2, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty.0*, %mtce.listty.0** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty.0* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty.0* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_2:%.*]] = getelementptr inbounds 
[[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_LISTHEAD]], %mtce.listty.0** [[MTCE_NEXTPTR_2]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { i32 } undef, i32 2, 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_0_OP0]], { i32 }* [[MTCE_OPERANDSGEP_0]], align 4 +; CHECK-NEXT: [[MTCE_ARG_1_OP0:%.*]] = insertvalue { i32 } undef, i32 3, 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 1, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_1_OP0]], { i32 }* [[MTCE_OPERANDSGEP_1]], align 4 +; CHECK-NEXT: [[MTCE_ARG_2_OP0:%.*]] = insertvalue { i32 } undef, i32 4, 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 2, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_2_OP0]], { i32 }* [[MTCE_OPERANDSGEP_2]], align 4 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: [[MTCE_PTOI_0:%.*]] = ptrtoint %mtce.listty.0* [[MTCE_NEXTPTR_FOR_0]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR1:%.*]] = or i64 [[MTCE_PTOI_0]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_0]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR1]] to %mtce.listty.0* +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_PTOI:%.*]] = ptrtoint %mtce.listty.0* [[MTCE_LISTHEAD]] to i64 +; CHECK-NEXT: [[MTCE_MARK:%.*]] = trunc i64 [[MTCE_PTOI]] to i1 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST_INTPTR:%.*]] = and i64 [[MTCE_PTOI]], -2 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST:%.*]] = inttoptr i64 [[MTCE_MASKED_WORKLIST_INTPTR]] to %mtce.listty.0* +; CHECK-NEXT: br i1 [[MTCE_MARK]], label [[MTCE_UNMARKED]], label [[MTCE_CHECK_RETURN:%.*]] +; CHECK: mtce.unmarked: +; CHECK-NEXT: [[MTCE_UMLOAD]] = load [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_MASKED_WORKLIST]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.check.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TG:%.*]] = icmp eq %mtce.listty.0* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TG]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD]] = load [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_LISTHEAD]], i64 -2 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_FREELIST_ADDED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_FREELIST]], %mtce.listty.0** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + call void @simple4_with_const(i32 1, i32 %a2, i32 %a3) + call void @simple4_with_const(i32 2, i32 %a2, i32 %a3) + call void @simple4_with_const(i32 3, i32 %a2, i32 %a3) + call void @simple4_with_const(i32 4, i32 %a2, i32 %a3) + ret void +} + +; Traverse a full quad-tree where the load is executed first +%quadtree_ty = type { [ 4 x %quadtree_ty* ] } +define void @quadtree(%quadtree_ty* %this, i32 %a2, i32 %a3) { +; CHECK-LABEL: @quadtree( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MTCE_LOOP:%.*]] +; CHECK: mtce.loop.from.worklist: +; 
CHECK-NEXT: [[MTCE_WORKLIST_ITEM:%.*]] = phi [[MTCE_LISTTY_1:%.*]] [ [[MTCE_UMLOAD:%.*]], [[MTCE_UNMARKED:%.*]] ], [ [[MTCE_MLOAD:%.*]], [[MTCE_ADD_FREELIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT2:%.*]] = phi %mtce.listty.1* [ [[MTCE_FREELIST:%.*]], [[MTCE_UNMARKED]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_ADD_FREELIST]] ] +; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY_1]] %mtce.worklist.item, 1, 0 +; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY_1]] %mtce.worklist.item, 0, 0 +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.loop: +; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty.1* [ null, [[ENTRY:%.*]] ], [ [[MTCE_TAGGED_NEXT_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST]] = phi %mtce.listty.1* [ null, [[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_NEXT2]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_THIS:%.*]] = phi %quadtree_ty* [ [[THIS:%.*]], [[ENTRY]] ], [ [[C0:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_OP_0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[C0PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY:%.*]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[C0]] = load %quadtree_ty*, %quadtree_ty** [[C0PTR]] +; CHECK-NEXT: [[C1PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 1 +; CHECK-NEXT: [[C1:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C1PTR]] +; CHECK-NEXT: [[C2PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 2 +; CHECK-NEXT: [[C2:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C2PTR]] +; CHECK-NEXT: [[C3PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 3 +; CHECK-NEXT: [[C3:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C3PTR]] +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty.1* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label [[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY_1]], i64 3, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 1 +; CHECK-NEXT: [[MTCE_PTOI_1:%.*]] = ptrtoint %mtce.listty.1* [[MTCE_NEXTPTR_FOR_1]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR:%.*]] = or i64 [[MTCE_PTOI_1]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_1:%.*]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR]] to %mtce.listty.1* +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_TAGGED_NEXT_FOR_1]], %mtce.listty.1** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 1, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 1, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 2 +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_NEXTPTR_FOR_2]], %mtce.listty.1** [[MTCE_NEXTPTR_1]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_2:%.*]] = getelementptr inbounds 
[[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 2, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_FREELIST]], i64 2, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty.1*, %mtce.listty.1** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty.1* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty.1* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_LISTHEAD]], %mtce.listty.1** [[MTCE_NEXTPTR_2]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { %quadtree_ty* } undef, %quadtree_ty* [[C1]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { %quadtree_ty* } [[MTCE_ARG_0_OP0]], { %quadtree_ty* }* [[MTCE_OPERANDSGEP_0]], align 8 +; CHECK-NEXT: [[MTCE_ARG_1_OP0:%.*]] = insertvalue { %quadtree_ty* } undef, %quadtree_ty* [[C2]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 1, i32 0 +; CHECK-NEXT: store { %quadtree_ty* } [[MTCE_ARG_1_OP0]], { %quadtree_ty* }* [[MTCE_OPERANDSGEP_1]], align 8 +; CHECK-NEXT: [[MTCE_ARG_2_OP0:%.*]] = insertvalue { %quadtree_ty* } undef, %quadtree_ty* [[C3]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 2, i32 0 +; CHECK-NEXT: store { %quadtree_ty* } [[MTCE_ARG_2_OP0]], { %quadtree_ty* }* [[MTCE_OPERANDSGEP_2]], align 8 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: [[MTCE_PTOI_0:%.*]] = ptrtoint %mtce.listty.1* [[MTCE_NEXTPTR_FOR_0]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR1:%.*]] = or i64 [[MTCE_PTOI_0]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_0]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR1]] to %mtce.listty.1* +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_PTOI:%.*]] = ptrtoint %mtce.listty.1* [[MTCE_LISTHEAD]] to i64 +; CHECK-NEXT: [[MTCE_MARK:%.*]] = trunc i64 [[MTCE_PTOI]] to i1 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST_INTPTR:%.*]] = and i64 [[MTCE_PTOI]], -2 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST:%.*]] = inttoptr i64 [[MTCE_MASKED_WORKLIST_INTPTR]] to %mtce.listty.1* +; CHECK-NEXT: br i1 [[MTCE_MARK]], label [[MTCE_UNMARKED]], label [[MTCE_CHECK_RETURN:%.*]] +; CHECK: mtce.unmarked: +; CHECK-NEXT: [[MTCE_UMLOAD]] = load [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_MASKED_WORKLIST]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.check.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TG:%.*]] = icmp eq %mtce.listty.1* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TG]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD]] = load 
[[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_LISTHEAD]], i64 -2 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_FREELIST_ADDED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_FREELIST]], %mtce.listty.1** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + %c0ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 0 + %c0 = load %quadtree_ty*, %quadtree_ty** %c0ptr + %c1ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 1 + %c1 = load %quadtree_ty*, %quadtree_ty** %c1ptr + %c2ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 2 + %c2 = load %quadtree_ty*, %quadtree_ty** %c2ptr + %c3ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 3 + %c3 = load %quadtree_ty*, %quadtree_ty** %c3ptr + call void @quadtree(%quadtree_ty* %c0, i32 %a2, i32 %a3) + call void @quadtree(%quadtree_ty* %c1, i32 %a2, i32 %a3) + call void @quadtree(%quadtree_ty* %c2, i32 %a2, i32 %a3) + call void @quadtree(%quadtree_ty* %c3, i32 %a2, i32 %a3) + ret void +} + +; Traverse a full quad-tree where the load is executed as part of function H +define void @quadtree_H(%quadtree_ty* %this, i32 %a2, i32 %a3) { +; CHECK-LABEL: @quadtree_H( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MTCE_LOOP:%.*]] +; CHECK: mtce.loop.from.worklist: +; CHECK-NEXT: [[MTCE_WORKLIST_ITEM:%.*]] = phi [[MTCE_LISTTY_2:%.*]] [ [[MTCE_UMLOAD:%.*]], [[MTCE_UNMARKED:%.*]] ], [ [[MTCE_MLOAD:%.*]], [[MTCE_ADD_FREELIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT2:%.*]] = phi %mtce.listty.2* [ [[MTCE_FREELIST:%.*]], [[MTCE_UNMARKED]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_ADD_FREELIST]] ] +; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY_2]] %mtce.worklist.item, 1, 0 +; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY_2]] %mtce.worklist.item, 0, 0 +; CHECK-NEXT: [[TMP0:%.*]] = load %quadtree_ty*, %quadtree_ty** [[MTCE_OP_0]] +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.loop: +; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty.2* [ null, [[ENTRY:%.*]] ], [ [[MTCE_TAGGED_NEXT_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST]] = phi %mtce.listty.2* [ null, [[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_NEXT2]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_THIS:%.*]] = phi %quadtree_ty* [ [[THIS:%.*]], [[ENTRY]] ], [ [[C0:%.*]], [[MTCE_LOOP_1]] ], [ [[TMP0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[C0PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY:%.*]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[C0]] = load %quadtree_ty*, %quadtree_ty** [[C0PTR]] +; CHECK-NEXT: [[C1PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 1 +; CHECK-NEXT: [[C2PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 2 +; CHECK-NEXT: [[C3PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 3 +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty.2* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label 
[[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY_2]], i64 3, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 1 +; CHECK-NEXT: [[MTCE_PTOI_1:%.*]] = ptrtoint %mtce.listty.2* [[MTCE_NEXTPTR_FOR_1]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR:%.*]] = or i64 [[MTCE_PTOI_1]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_1:%.*]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR]] to %mtce.listty.2* +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_TAGGED_NEXT_FOR_1]], %mtce.listty.2** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 1, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 1, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 2 +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_NEXTPTR_FOR_2]], %mtce.listty.2** [[MTCE_NEXTPTR_1]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 2, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_FREELIST]], i64 2, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty.2*, %mtce.listty.2** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty.2* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty.2* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_LISTHEAD]], %mtce.listty.2** [[MTCE_NEXTPTR_2]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { %quadtree_ty** } undef, %quadtree_ty** [[C1PTR]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { %quadtree_ty** } [[MTCE_ARG_0_OP0]], { %quadtree_ty** }* [[MTCE_OPERANDSGEP_0]], align 8 +; CHECK-NEXT: [[MTCE_ARG_1_OP0:%.*]] = insertvalue { %quadtree_ty** } undef, %quadtree_ty** [[C2PTR]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 1, i32 0 +; CHECK-NEXT: store { %quadtree_ty** } [[MTCE_ARG_1_OP0]], { %quadtree_ty** }* [[MTCE_OPERANDSGEP_1]], align 8 +; CHECK-NEXT: [[MTCE_ARG_2_OP0:%.*]] = insertvalue { %quadtree_ty** } undef, %quadtree_ty** [[C3PTR]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 2, i32 0 +; CHECK-NEXT: 
store { %quadtree_ty** } [[MTCE_ARG_2_OP0]], { %quadtree_ty** }* [[MTCE_OPERANDSGEP_2]], align 8 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: [[MTCE_PTOI_0:%.*]] = ptrtoint %mtce.listty.2* [[MTCE_NEXTPTR_FOR_0]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR1:%.*]] = or i64 [[MTCE_PTOI_0]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_0]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR1]] to %mtce.listty.2* +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_PTOI:%.*]] = ptrtoint %mtce.listty.2* [[MTCE_LISTHEAD]] to i64 +; CHECK-NEXT: [[MTCE_MARK:%.*]] = trunc i64 [[MTCE_PTOI]] to i1 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST_INTPTR:%.*]] = and i64 [[MTCE_PTOI]], -2 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST:%.*]] = inttoptr i64 [[MTCE_MASKED_WORKLIST_INTPTR]] to %mtce.listty.2* +; CHECK-NEXT: br i1 [[MTCE_MARK]], label [[MTCE_UNMARKED]], label [[MTCE_CHECK_RETURN:%.*]] +; CHECK: mtce.unmarked: +; CHECK-NEXT: [[MTCE_UMLOAD]] = load [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_MASKED_WORKLIST]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.check.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TG:%.*]] = icmp eq %mtce.listty.2* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TG]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD]] = load [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_LISTHEAD]], i64 -2 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_FREELIST_ADDED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_FREELIST]], %mtce.listty.2** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + %c0ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 0 + %c0 = load %quadtree_ty*, %quadtree_ty** %c0ptr + call void @quadtree_H(%quadtree_ty* %c0, i32 %a2, i32 %a3) + %c1ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 1 + %c1 = load %quadtree_ty*, %quadtree_ty** %c1ptr + call void @quadtree_H(%quadtree_ty* %c1, i32 %a2, i32 %a3) + %c2ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 2 + %c2 = load %quadtree_ty*, %quadtree_ty** %c2ptr + call void @quadtree_H(%quadtree_ty* %c2, i32 %a2, i32 %a3) + %c3ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 3 + %c3 = load %quadtree_ty*, %quadtree_ty** %c3ptr + call void @quadtree_H(%quadtree_ty* %c3, i32 %a2, i32 %a3) + ret void +} + +; Traverse a full quad-tree where function H is different for one call +define void @quadtree_invalH(%quadtree_ty* %this, i32 %a2, i32 %a3) { +; CHECK-LABEL: @quadtree_invalH( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C0PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY:%.*]], %quadtree_ty* [[THIS:%.*]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[C0:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C0PTR]] +; CHECK-NEXT: [[C1PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[THIS]], i64 0, i32 0, i32 1 +; CHECK-NEXT: [[C1:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C1PTR]] +; CHECK-NEXT: call void @quadtree_invalH(%quadtree_ty* [[C0]], i32 [[A2:%.*]], i32 [[A3:%.*]]) +; CHECK-NEXT: call void 
@quadtree_invalH(%quadtree_ty* [[C1]], i32 [[A2]], i32 [[A3]]) +; CHECK-NEXT: [[C2PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[THIS]], i64 0, i32 0, i32 2 +; CHECK-NEXT: [[C2:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C2PTR]] +; CHECK-NEXT: call void @quadtree_invalH(%quadtree_ty* [[C2]], i32 [[A2]], i32 [[A3]]) +; CHECK-NEXT: [[C3PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[THIS]], i64 0, i32 0, i32 3 +; CHECK-NEXT: [[C3:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C3PTR]] +; CHECK-NEXT: call void @quadtree_invalH(%quadtree_ty* [[C3]], i32 [[A2]], i32 [[A3]]) +; CHECK-NEXT: ret void +; +entry: + %c0ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 0 + %c0 = load %quadtree_ty*, %quadtree_ty** %c0ptr + %c1ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 1 + %c1 = load %quadtree_ty*, %quadtree_ty** %c1ptr + call void @quadtree_invalH(%quadtree_ty* %c0, i32 %a2, i32 %a3) + call void @quadtree_invalH(%quadtree_ty* %c1, i32 %a2, i32 %a3) + %c2ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 2 + %c2 = load %quadtree_ty*, %quadtree_ty** %c2ptr + call void @quadtree_invalH(%quadtree_ty* %c2, i32 %a2, i32 %a3) + %c3ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 3 + %c3 = load %quadtree_ty*, %quadtree_ty** %c3ptr + call void @quadtree_invalH(%quadtree_ty* %c3, i32 %a2, i32 %a3) + ret void +} Index: test/Transforms/MultiTailCallElimination/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/MultiTailCallElimination/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AArch64' in config.root.targets: + config.unsupported = True