Index: include/llvm-c/Transforms/Scalar.h =================================================================== --- include/llvm-c/Transforms/Scalar.h +++ include/llvm-c/Transforms/Scalar.h @@ -123,6 +123,9 @@ /** See llvm::createTailCallEliminationPass function. */ void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM); +/** See llvm::createMultiTailCallEliminationPass function. */ +void LLVMAddMultiTailCallEliminationPass(LLVMPassManagerRef PM); + /** See llvm::createConstantPropagationPass function. */ void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM); Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -385,6 +385,7 @@ void initializeStripSymbolsPass(PassRegistry&); void initializeStructurizeCFGPass(PassRegistry&); void initializeTailCallElimPass(PassRegistry&); +void initializeMultiTailCallEliminationPass(PassRegistry&); void initializeTailDuplicatePass(PassRegistry&); void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); void initializeTargetPassConfigPass(PassRegistry&); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -280,6 +280,12 @@ //===----------------------------------------------------------------------===// // +// MultiTailCallElimination - This pass eliminates multiple tail calls. +// +FunctionPass *createMultiTailCallEliminationPass(); + +//===----------------------------------------------------------------------===// +// // EarlyCSE - This pass performs a simple and fast CSE pass over the dominator // tree. // Index: include/llvm/Transforms/Scalar/MultiTailCallElimination.h =================================================================== --- /dev/null +++ include/llvm/Transforms/Scalar/MultiTailCallElimination.h @@ -0,0 +1,30 @@ +//===---- MultiTailCallElimination.h ----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts multiple tail recursive calls into a loop, by modeling the +// calls as a single linked worklist explicitly on the stack. +// +// See MultiTailCallElimination.cpp for more details. 
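+//
+// Usage sketch (illustrative only, not part of this patch): with the new pass
+// manager the transformation is added as a function pass, e.g.
+//
+//   FunctionPassManager FPM;
+//   FPM.addPass(MultiTailCallEliminationPass());
+//
+// For the legacy pass manager the pass is created via
+// llvm::createMultiTailCallEliminationPass().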
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H +#define LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct MultiTailCallEliminationPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} + +#endif // LLVM_TRANSFORMS_SCALAR_RECURSIONSTACKELIMINATION_H Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -133,6 +133,7 @@ #include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h" #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" +#include "llvm/Transforms/Scalar/MultiTailCallElimination.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/NewGVN.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" @@ -359,6 +360,14 @@ assert(Level != O0 && "Must request optimizations!"); FunctionPassManager FPM(DebugLogging); + if (Level == O3) { + FPM.addPass(SROA()); // Try to get rid of allocas + FPM.addPass(MultiTailCallEliminationPass()); + FPM.addPass(VerifierPass()); + FPM.addPass(ADCEPass()); // Delete dead instructions + FPM.addPass(SimplifyCFGPass()); // Merge & remove BBs + } + // Form SSA out of local memory accesses after breaking apart aggregates into // scalars. FPM.addPass(SROA()); Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -543,6 +543,15 @@ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args addExtensionsToPM(EP_CGSCCOptimizerLate, MPM); + + if ((SizeLevel == 0) && (OptLevel > 2)) { + MPM.add(createSROAPass()); + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createMultiTailCallEliminationPass()); + MPM.add(createVerifierPass()); + MPM.add(createAggressiveDCEPass()); // Delete dead instructions + } + addFunctionSimplificationPasses(MPM); // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -48,6 +48,7 @@ MemCpyOptimizer.cpp MergeICmps.cpp MergedLoadStoreMotion.cpp + MultiTailCallElimination.cpp NaryReassociate.cpp NewGVN.cpp PartiallyInlineLibCalls.cpp Index: lib/Transforms/Scalar/MultiTailCallElimination.cpp =================================================================== --- /dev/null +++ lib/Transforms/Scalar/MultiTailCallElimination.cpp @@ -0,0 +1,2472 @@ +//===- MultiTailCallElimination.cpp - Eliminate multiple tail calls -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts multiple tail recursive calls into a loop, by modeling the +// calls as a single linked worklist explicitly on the stack. +// +// void f(a, b, const c) { +// ... +// if (...) +// return +// +// a1, b1, b2, b3, x2, x3 = ... 
+// +// f(a1, b1, c) // 1st recursion +// a2 = h(x2) +// f(a2, b2, c) // 2nd recursion +// a3 = h(x3) +// f(a3, b3, c) // 3rd recursion +// } +// +// -> +// +// void f(a, b, const c) { +// struct { x, b, next } *worklist = null +// +// loop: +// ... +// if (...) +// goto check_return +// +// a1, b1, b2, b3, x2, x3 = ... +// +// /* Assign arguments for the first call */ +// +// a = a1 +// b = b1 +// +// /* Put arguments of the remaining calls into the work list */ +// +// queue = alloca(2 * sizeof(*worklist)) +// queue[0] = {x2, b2, &queue[1]} +// queue[1] = {x3, b3, worklist} +// worklist = queue +// +// goto loop +// +// check_return: +// if (!worklist) +// return +// a = h(worklist->x) +// b = worklist->b +// worklist = worklist->next +// +// goto loop +// } +// +// Such patterns occur for example when an application traverses k-ary full trees. +// +// The benefits of this transformation is, that neither frame nor link address +// have to be stored on the stack. Also pass through arguments, like 'const c' +// in example above, are less likely required to saved onto the stack, and if so +// less often (only once in the entry block). +// +// The downsides are: +// +// a) An additional store for the worklist is required. +// +// b) The worst case required stack memory after this transformation depends on +// the number of recursive calls, instead of the recursion depth. +// +// c) Additional conditional branches +// +// ad a) This point is compensated by avoiding storing the link address in each +// call. +// +// ad b) This pass additionally can add (and does it by default) code to manage +// unused blocks of allocas in a freelist. Because allocations are done in +// blocks the freelist management, can also be done in blocks. This requires +// the last item in each block to be marked, detectable by the code within +// return. +// +// Free list management introduces the following code: +// +// 1) Upon entry a second variable is added just like the worklist. +// +// struct { x, b, next } *freelist = null +// +// 2) The alloca instruction is surrounded by a conditional +// +// if (freelist != nullptr) { +// queue = freelist +// freelist = freelist[].next; +// } +// else { +// queue = alloca(2 * sizeof(*worklist)) +// } +// +// 3) The return path in check return is modified dependent on the marker +// algorithm (see below) +// +// Currently the following mark algorithms are implemented: +// +// A further variable is stored on the stack containing the current +// freelist. On the other hand by not executing a recursion the frame pointer +// loads and stores are omitted. +// +// * TrueMarker: This algorithm marks each element and is applicable to binary +// recursions, such as quick sort. +// +// The following return path is generated: +// +// check_return: +// if (!worklist) +// return +// +// struct { x, b, next } nextitem = *worklist +// worklist[] = freelist +// freelist = worklist +// +// a = h(nextitem.x) +// b = nextitem.b +// worklist = nextitem.next +// +// goto loop +// +// * TaggedMarker: (not yet implemented) Similar to the FieldMarker this +// marker marks the last item by a bit. But instead of using a separate bit +// it uses Bit 0 of the worklist field. If the alignment of the worklist is +// a power of 2, and if it is >= 2, this marker can also cover the general +// case. It requires some additional bit masking but no additional memory +// operations. 
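+//
+// For illustration only (a C-like sketch, not code added by this patch), the
+// bit-0 tagging could be implemented roughly as:
+//
+//   uintptr_t tag(item *p)                   { return (uintptr_t)p | 1; }
+//   item     *untag(uintptr_t v)             { return (item *)(v & ~(uintptr_t)1); }
+//   bool      is_last_in_block(uintptr_t v)  { return (v & 1) == 0; }
+//
+// A set bit 0 means a further item follows within the same allocation block;
+// a cleared bit 0 marks the last item, whose block can be put on the freelist.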
+// +// The following return path is generated: +// +// check_return: +// struct { x, b, next } nextitem; +// +// if (bit_0_is_set(worklist)) { +// nextitem = *mask_bit_0(worklist) +// goto pre_loop +// } +// +// if (!worklist) +// return +// +// nextitem = *worklist +// worklist[] = freelist +// freelist = worklist +// +// pre_loop: +// a = h(nextitem.x) +// b = nextitem.b +// worklist = nextitem.next +// goto loop +// +// * FalseMarker: This algorithm marks no element, and is applicable when +// freelist management is disabled. Using this marker implicitly unmarks all +// items. This leads to the same code as described in the introduction. +// +// Additionally the following Marker algorithms have been tested but have been +// dropped. +// +// * FieldMarker: A separate bit is allocated in the worklist. This marker +// requires additional instructions but can be used in the general case. +// This marker is inferior to the tagged marker. +// +// * CompareMarker: Assuming allocas return memory addresses in a strictly +// monotonic order. The freelist can be modeled to return the same order +// when pulling elements from it. Comparing each worklist with worklist.next +// can then reveal the information if the element is marked. The performance +// is similar to the tagged marker but it requires an order for the returned +// pointers of alloca. LLVM does not guarantee this order in general. +// +// ad c) The pass adds 1 conditional branch into the return path and 1 +// additional branches for freelist management (see (b) above). Depending on +// the target, machine branch prediction can elevate this. +// +// List organization +// ----------------- +// +// Suppose a recursion the calls itself 4 times. Fig. 1 depicts a possible list +// state when executing the Step 3.3. Column 1 is a representation of the +// parameters. Column 2 denotes the information whether an item is marked and +// the third column shows the link information of the next pointer. +// +// Worklist always points the the first item to execute after the current +// item. One see that worklist already points the Step 3.4 while executing Step +// 3.3. +// +// Fig. 1: Work-List and free-List while executing Step 3.3. Note that work-list +// already points to Step 3.4 even when currently executing Step 3.3. +// +// +----------+---+-----+ +// | Step 2 | | * | +// +----------+---+ | --+ +----------+---+-----+ +// | Step 3 | | v * | | Step 3.2 | | * | +// +----------+---+-- | | +----------+---+ | --+ +// | Step 4 | M | * v | <-\ | Step 3.3 | | v * | [currently executing] +// +----------+---+-|---+ | +----------+---+-- | + +// v | | Step 3.4 | M | * v | <- [worklist] +// nullptr | +----------+---+ | --+ +// | | +// \--------------------/ +// +// +----------+---+-----+ +// | | | * | <- [freelist] +// +----------+---+ | --+ +// | | | v * | +// +----------+---+-- | + +// | | M | * v | +// +----------+---+ | --+ +// v +// nullptr +// +// The state while executing Step 3.4 might look like depicted in Fig. 2. The +// transformation of the state is done within the return path. +// +// Fig. 2: Work-list and free-list while executing Step 3.4. 
+//
+// +----------+---+-----+
+// | Step 2 | | * |
+// +----------+---+ | --+ +----------+---+-----+
+// | Step 3 | | v * | | Step 3.2 | | * | <- [free-list]
+// +----------+---+-- | | +----------+---+ | --+
+// | Step 4 | M | * v | <-\ | Step 3.3 | | v * |
+// +----------+---+-|---+ | +----------+---+-- | +
+// v | | Step 3.4 | M | * v | [currently executing]
+// nullptr | +----------+---+ | --+
+// | |
+// \--- [work-list] \-\
+// |
+// +----------+---+-- | +
+// | | | * v |
+// +----------+---+ | --+
+// | | | v * |
+// +----------+---+-- | +
+// | | M | * v |
+// +----------+---+ | --+
+// v
+// nullptr
+//
+// Algorithm outline:
+// ------------------
+//
+// Analysis Phase:
+//
+// 1) Analyze the function and gather the returning basic blocks (BB) (and BBs
+//    branching to a return-only BB) with recursive calls (RC).
+//
+// 2) If more than one such BB or no RC is found, abandon the transformation.
+//
+// 3) If the remaining BB has only one RC, abandon the transformation. Otherwise
+//    let N be the number of RCs in this BB.
+//
+// 4) Analyze the instructions in this BB from the first RC until the
+//    terminator instruction and classify each instruction as movable or static.
+//
+//    A movable instruction is an instruction that can be safely moved before
+//    the first RC. All other instructions are classified static.
+//
+// 5) Assign each static instruction to the following RC instruction. If static
+//    instructions are left after the last RC, abandon the transformation.
+//
+// 6) Build the function H with all its arguments for each RC. By including the
+//    call itself in function H and enforcing H to return void, it is ensured
+//    that there are no escaping values used after the recursion.
+//
+// 6.1) Note: By the way step 4 is executed it is guaranteed that function H for
+//    the first RC consists of a single instruction: the call itself. The first
+//    call candidate is handled specially (the same way as in Tail Recursion
+//    Elimination (TRE)).
+//
+// 5,6) Note: The information gathered on each RC is collected in the structure
+//    RecursiveCall.
+//
+// 7) Compare the second RC's function H with all later ones. The behavior must
+//    match, otherwise abandon the transformation.
+//
+// 7.1) Note: As the first RC's function H is basically a TRE, it can be ignored
+//    in this step.
+//
+// 8) Simplify the argument list by removing constants and pass-through arguments.
+//
+// 9) Decide whether it is profitable to use the transformation.
+//
+// Transformation Phase:
+//
+// 1) Adjust the entry block and split off the loop entry.
+//
+// 2) Eliminate the first RC (similar to TRE).
+//
+// 3) Eliminate the remaining RCs by allocating a new block of N-1 struct items
+//    (or picking one from the freelist) and filling it. This block is put at
+//    the front of the list. Pulling items from the list is added in (4). The
+//    execution of function H is ensured in (5).
+//
+// 4) Create a new return block which pulls items from the worklist. If an
+//    end-of-block marker is reached, the block is put into the freelist.
+//
+// 4.1) Add the instructions from function H into the return block and create
+//    the loop.
+//
+// 5) Redirect each returning block into the new return block created in (4).
+//
+// Open issues and known TODOs - it would be great if reviewers could comment on
+// these as well:
+//
+// 1) Pipeline integration: Currently the pass is put before TRE. This includes
+//    some supporting passes for cleanup and preparation. This cannot be left
+//    as is. The preferred way could be by adjusting
+//    "AArch64TargetMachine::adjustPassManager" and only using it with O3.
+//    AArch64 is selected because this is the architecture for which the author
+//    has suitable benchmarking and test setups available. This was tried once
+//    (in a similar way as in AMDGPUTargetMachine::adjustPassManager), however
+//    the result was that many LLVM projects did not compile anymore because of
+//    linker problems (the Passes library was missing). Do you have any advice
+//    here?
+//
+// 2) The way to test whether it is profitable to use the pass needs adjustment.
+//    I think that functions that spill more registers have an increased chance
+//    to profit, while functions that spill less have a lower chance.
+//
+// 4) Is it safe to temporarily create a function and return no-changes after
+//    deleting it? If not, is there a better way than using the
+//    FunctionComparator?
+//
+// 5) GlobalsAA needs to be preserved. Not sure about this in this context;
+//    loads and stores are added here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MultiTailCallElimination.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+
+#include
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "multi-tce"
+
+namespace {
+
+/// Option to disable the pass.
+static cl::opt<bool>
+DisableMultiTailCallElimination("disable-" DEBUG_TYPE, cl::init(false),
+    cl::Hidden, cl::desc("Disable multi tail call elimination"));
+
+/// Option to disable freelist handling.
+static cl::opt<bool>
+DisableFreeList("disable-" DEBUG_TYPE "-freelist", cl::init(false),
+    cl::Hidden, cl::desc("Disable freelists in multi tail call elimination"));
+
+/// Local implementation class.
+struct MultiTailCallEliminationImpl {
+  MultiTailCallEliminationImpl(
+      Function &F, AliasAnalysis &AA, TargetTransformInfo &TTI,
+      OptimizationRemarkEmitter &ORE) :
+    DL(F.getParent()->getDataLayout()),
+    C(F.getParent()->getContext()),
+    TTI(TTI),
+    ORE(ORE),
+    F(F),
+    AA(AA) {}
+
+  bool run() {
+    // Check if feasible and profitable,
+    if (analyzeFunction()) {
+      // then eliminate the tail calls.
+      eliminateTailCalls();
+      return true;
+    }
+
+    return false;
+  }
+
+private:
+  /// Helper class to collect information on a single recursive call site. This
+  /// class also stores the required properties of the function H (see the
+  /// introduction's description of this pass).
+  struct RecursiveCall {
+  private:
+    /// Hide copy constructor.
+    RecursiveCall(const RecursiveCall &O) : Index(O.Index), F(O.F), H(O.H), CI(O.CI), Is() {
+      llvm_unreachable("RecursiveCall cannot be copied");
+    }
+
+  public:
+    RecursiveCall(CallInst *CI, size_t Index) : Index(Index), F(*CI->getParent()->getParent()), H(nullptr), CI(CI), Is() {}
+
+    ~RecursiveCall() {
+      // If function H was already created, delete it. It is only used temporarily.
+ if (H) + H->eraseFromParent(); + } + + /// Add an instruction to the list representing the function H. + void addInstruction(Instruction *I) { + Is.push_back(I); + } + + /// Returns the recursive call instruction. + CallInst *getCallInst() const { + return CI; + } + + /// Test if the value V is part of function H. Because H is not created these + /// instructions are denoted "local" + bool isLocalValue(Value *V, unsigned *Idx = nullptr) { + auto LocalInstructionIt = std::find(Is.begin(), Is.end(), V); + + if (LocalInstructionIt == Is.end()) + return false; + + if (Idx) + *Idx = LocalInstructionIt - Is.begin(); + + return true; + } + + /// Get the list operands of that are required for calling the function H + const std::vector &getIncomingOperands() const { + return IncomingOperands; + } + + /// If possible create the function H (including the final recursive call + /// instruction). If it succeeds buildFunctionH returns true and assigns it to + /// the member variable H. The created function is purely temporary. Its + /// lifetime is bound to this object. The created function always returns void. + bool buildFunctionH() { + if (H) + H->eraseFromParent(); + H = nullptr; + + // Get the arguments list from H + SmallVector HArgs; + for (auto &U: IncomingOperands) + HArgs.push_back(U->get()->getType()); + + // Get the function type and create it + FunctionType *HTy = FunctionType::get(Type::getVoidTy(F.getContext()), makeArrayRef(HArgs), F.isVarArg()); + H = Function::Create(HTy, GlobalValue::PrivateLinkage, F.getAddressSpace(), F.getName() + ".H" + Twine(Index), F.getParent()); + BasicBlock *NewBB = BasicBlock::Create(H->getContext(), "entry", H); + + // Value-Value map to match instruction operands between the source and clone. + std::map VVMap; + + // Define a lambda used to clone an instruction. If leave constants is true + // those are taken from the argument list if possible. + auto cloneAndAppendInst = [this, &VVMap, NewBB] (Instruction *I, bool leaveConstants) { + // We do not allow any uses outside the newly created function. Otherwise + // the function would return some non void value. + for (Use &U : I->uses()) { + if (!isLocalValue(U.getUser()) && U.getUser() != CI) { + H->eraseFromParent(); + H = nullptr; + return false; + } + } + + // Clone the instruction, assign it to the newly created function and + // adjust all operands. + Instruction *NewI = I->clone(); + VVMap[I] = NewI; + NewBB->getInstList().push_back(NewI); + for (Use &U: NewI->operands()) { + // Keep constant operands as is. + if (leaveConstants && isa(U.get())) + continue; + + // Replace other operands by their counter part in the new function + if (isLocalValue(U.get())) { + auto VVMI = VVMap.find(U.get()); + if (VVMI == VVMap.end()) { + H->eraseFromParent(); + H = nullptr; + return false; + } + + U.set(VVMI->second); + } + else { + auto IOI = std::find_if(IncomingOperands.begin(), IncomingOperands.end(), [&U] (const Use* S) -> bool { + return S->get() == U.get(); + }); + + // If the operand is not found, ignore it if and only if it is a constant. 
+ if (IOI == IncomingOperands.end() && isa(U.get())) + continue; + assert(IOI != IncomingOperands.end() && "Operand not in argument list"); + size_t ArgIdx = IOI - IncomingOperands.begin(); + U.set(H->arg_begin() + ArgIdx); + } + } + + return true; + }; + // EOL - Lambda cloneAndAppendInst + + // Clone all instructions into the new function, + for (auto &I: Is) { + if (!cloneAndAppendInst(I, true)) + return false; + } + // but allow constant arguments for the recursive call inst. + if (!cloneAndAppendInst(CI, false)) + return false; + + // Finally, append the terminator instruction. + ReturnInst::Create(H->getContext(), NewBB); + + return true; + } + + /// Return function H and create it if necessary. + Function *getFunctionH() { + if (!H) { + buildFunctionH(); + } + return H; + } + + /// Remove duplicates within the operands while preserving the order of each + /// first occurrence. The operands a given as use. For duplicates and + /// arbitrary use is selected to be kept. + void cleanupIncomingOperands() { + std::map map; + std::vector tmp(IncomingOperands.begin(), IncomingOperands.end()); + IncomingOperands.clear(); + for (auto &O: tmp) { + bool inserted; + std::tie(std::ignore, inserted) = map.insert(std::make_pair(O->get(), O)); + if (inserted) + IncomingOperands.push_back(O); + } + } + + /// Collect all required operand for instruction I. If allowConstants is set, + /// Constants are also put on the operand list. + void analyzeOperands(User::op_range Operands, Instruction *I, bool allowConstants) { + for (auto &U: Operands) { + if (!allowConstants && isa(U.get())) + continue; + + // If the operand is a non-local value add it to the list. This covers all + // movable instructions, as well as static instructions of other previous + // candidates. For movable instructions this is feasible because they + // instructions will remain (moved). On the other hand static instruction + // are removed. Referring to those is invalid. An example is depicted in + // the sequence below. + // + // ; Candidate 0 + // call @recursive(%donotcare) + // ; ... + // ; + // ; Candidate n + // %a = load %something + // %b = load %somethingelse + // call @recursive(%a) + // ; ... + // ; + // ; Candidate m > n + // call @recursive(%b) + // + // This case is not detected within this call. However, because the + // function H of candidate n would have to return a non-void value, it is + // covered during buildFunctionH. + if (!isLocalValue(U.get())) { + IncomingOperands.push_back(&U); + } + } + } + + /// Analyze if this candidate can be feasible for this transformation. + bool analyze() { + IncomingOperands.clear(); + for (auto &I: Is) { + // In general it is not safe to replace a constant by an arbitrary value + // in the LLVM IR. allowConstants is there for set to false to avoid + // putting them onto the argument list. + analyzeOperands(I->operands(), I, false); + } + // As we know that all candidates are recursive only arg_operands are + // analyzed for the recursive call instruction. Also for CallInst + // arg_operands it is feasible and potentially profitable to replace + // constants with values. E.g.: + // + // call @recursive(%this, 0); + // call @recursive(%this, 1); + // ... + // call @recursive(%this, n); + // ret + analyzeOperands(CI->arg_operands(), CI, true); + + // Remove duplicates from operand list. + cleanupIncomingOperands(); + + // Try to encapsulate call and all static instruction into an new separate function. 
+ if (!buildFunctionH()) + return false; + + return true; + } + + /// Test if O's function H behaves equally compared to the own function H. + bool compareFunctionH(RecursiveCall *O) const { + GlobalNumberState GN; + return FunctionComparator(H, O->H, &GN).compare() == 0; + } + + /// Erase the CallInst from the source function. + void eraseSourceCallInst() { + CI->eraseFromParent(); + CI = nullptr; + } + + /// Erase all static instructions from the source function. + void eraseSourceInstructions() { + eraseSourceCallInst(); + for (auto i = Is.rbegin(), e = Is.rend(); i != e; /* inloop */) { + (*(i++))->eraseFromParent(); + } + Is.clear(); + } + + size_t getIndex() const { + return Index; + } + + private: + /// Index, solely used for naming + size_t Index; + + /// Recursive function the candidate is called within + Function &F; + + /// Temporary function H (including the recursive call). This function is used + /// during analysis and transformation. It is never called and destroyed when + /// this object is deleted. + Function *H; + + /// Recursive call instruction. + CallInst *CI; + + /// Instructions that precede the current call. + SmallVector Is; + std::vector IncomingOperands; + }; + + /// Helper class to collect and classify used arguments and analyze arguments + /// used by function H. + struct IncomingArgument { + /// Stacked arguments are arguments that need to be modeled on the stack + /// + /// Static arguments are arguments which are passed through to all + /// recursive calls without changing position nor value. + /// + /// e.g.: + /// fn(a, b, c) { + /// fn(a, b, c); + /// fn(a + 1, a, c); + /// } + /// => StaticArguments = {c} + /// + /// Constant arguments are Constants that are the equal for each call except the first. + /// + /// UnknownKind are arguments which are not yet classified. + enum Kind { + StackedKind, + StaticKind, + ConstantKind, + UnknownKind, + }; + + IncomingArgument() : Uses(), K(UnknownKind) {} + + /// Add an argument use and update the current classification. + void addUse(const RecursiveCall *CC, Use *U, bool CheckOnly = false) { + Value *V = U->get(); + switch (K) { + case UnknownKind: + K = StackedKind; + if (isa(V)) { + K = ConstantKind; + } + else if (Argument *A = dyn_cast(V)) { + if (A == CC->getCallInst()->getArgOperand(A->getArgNo())) { + K = StaticKind; + } + } + Ty = V->getType(); + break; + + case StackedKind: + // Keep Stacked as Stacked. + break; + + case ConstantKind: + // Change kind to stacked if the new argument is no constant or if the + // constant value differs. + if (Constant *C = dyn_cast(V)) { + if (C != getValue()) + K = StackedKind; + } + else { + K = StackedKind; + } + break; + + case StaticKind: + // Change kind to stacked if the new argument is no argument or if the + // the position in the corresponding CallInst is unsuitable. + if (Argument *A = dyn_cast(V)) { + if (A != CC->getCallInst()->getArgOperand(A->getArgNo())) { + K = StackedKind; + } + } + else { + K = StackedKind; + } + break; + } + assert(Ty == V->getType() && "Argument type mismatch"); + if (!CheckOnly) + Uses.insert(std::make_pair(CC, U)); + } + + /// Get the used value of a specific candidate, or the used value of any + /// candidate. + Value *getValue(const RecursiveCall *CC = nullptr) const { + if (CC) + return Uses.find(CC)->second->get(); + else + return Uses.begin()->second->get(); + } + + /// Get the type of this argument. + Type *getType() const { + return Ty; + } + + /// Returns true it this argument is of stacked kind. 
+ bool isStacked() const { + assert(K != UnknownKind && "Argument kind is unknown"); + return K == StackedKind; + } + + /// Returns true it this argument is of static kind. + bool isStatic() const { + assert(K != UnknownKind && "Argument kind is unknown"); + return K == StaticKind; + } + + /// Returns true it this argument is of constant kind. + bool isConstant() const { + assert(K != UnknownKind && "Argument kind is unknown"); + return K == ConstantKind; + } + + private: + /// Candidate-Use map of the argument. + std::map Uses; + + /// Classification. + Kind K; + + /// Argument type. + Type *Ty; + }; + + struct TrueMarker { + using ItemListContainer = SmallVectorImpl; + + TrueMarker(LLVMContext &C, unsigned OtherCandidatesNumOf) : C(C), OtherCandidatesNumOf(OtherCandidatesNumOf) {} + virtual ~TrueMarker() {} + + virtual int computeAllocaCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + // We factor in the stores to create the inner links + return Factor * (OtherCandidatesNumOf - 1) * TTI.getMemoryOpCost( + Instruction::Store, + // Pointer to next item + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()); + } + + virtual int computeFreelistManagementCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + // Store the next pointer + return Factor * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()); + } + + virtual int computeAllocFromFreeCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + // Load the next pointer + return Factor * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()); + } + + virtual int computeEliminationCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + return + // One Conditional Branch + Factor * TTI.getCFInstrCost(Instruction::Br) + + // assume 50% alloca (round up) + (computeAllocaCost(ListTy, Factor, TTI, DL) / 2) + + // assume 50% allocate from free list (round down) + (computeAllocFromFreeCost(ListTy, Factor, TTI, DL) / 2) + + // Store pointer to next item, once + Factor * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()) + + // Store parameters for each call itself + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(0)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(0)), + DL.getAllocaAddrSpace()); + } + + virtual int computeReturnPathCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + return + // One Conditional Branch for check return per iteration + Factor * OtherCandidatesNumOf * TTI.getCFInstrCost(Instruction::Br) + + // Free List Management per Iteration + OtherCandidatesNumOf * computeFreelistManagementCost(ListTy, Factor, TTI, DL) + + // Cost for loading the next element per iteration + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getPointerTo(), + DL.getABITypeAlignment(ListTy), + DL.getAllocaAddrSpace()); + } + + virtual int computeTransformCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) { + return + // One Conditional 
Branch for check return + Factor * TTI.getCFInstrCost(Instruction::Br) + + computeEliminationCost(ListTy, Factor, TTI, DL) + + computeReturnPathCost(ListTy, Factor, TTI, DL); + } + + // Create the return path for the TrueMarker. + // + // Generated Code: + // + // mtce.return: + // if (worklist == nullptr) + // goto mtce.retonly; + // + // mtce.add.freelist: + // nextitem = *worklist; + // ; Freelist management code is inserted here + // goto loopentry; + // + // mtce.retonly: + // return; + // + virtual std::tuple insertReturnPath( + Instruction *ListItem, Instruction *FreeList, BasicBlock *MtceReturnBB, + PHINode *WorkListItemPHI, PHINode *FreeListRecursivePHI, + const DebugLoc &DebugLoc, const DataLayout &DL) { + + Instruction *InsertPos; + + // Start by creating a basic block that adds an item to the free list + BasicBlock *AddFreeListBB = MtceReturnBB->splitBasicBlock(MtceReturnBB->getFirstNonPHI(), + "mtce.add.freelist"); + + // Check next item for null and return if necessary + InsertPos = MtceReturnBB->getTerminator(); + ICmpInst *CmpI = new ICmpInst(InsertPos, + ICmpInst::ICMP_EQ, + ListItem, + ConstantPointerNull::get(cast(ListItem->getType())), + "mtce.worklist.is.null.tm"); + CmpI->setDebugLoc(DebugLoc); + + // Create the final return only BB and branch to if indicated by the + // comparison. + BasicBlock *ReturningBB = AddFreeListBB->splitBasicBlock(AddFreeListBB->getTerminator(), + "mtce.retonly"); + BranchInst *CRBI = BranchInst::Create(ReturningBB, AddFreeListBB, CmpI); + CRBI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(MtceReturnBB->getTerminator(), CRBI); + + // Before managing the freelist the item has to be loaded in order not to + // change the next pointer of the worklist improperly. The according + // instructions is added here and the branch is updated (including the PHI node) + LoadInst *MLI = new LoadInst(ListItem, "mtce.mload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + AddFreeListBB->getTerminator()); + MLI->setDebugLoc(DebugLoc); + + BranchInst *ABI = cast(AddFreeListBB->getTerminator()); + BasicBlock *LoopEntry = WorkListItemPHI->getParent(); + ABI->setSuccessor(0, LoopEntry); + + WorkListItemPHI->addIncoming(MLI, AddFreeListBB); + + return std::make_tuple(ABI, ReturningBB); + } + + virtual bool mayPullFromFreeList() { + return true; + } + + virtual bool checkListType(Type *ListTy, const DataLayout &DL) { + return true; + } + + virtual void adjustStructFields(ItemListContainer &ItemList) {}; + + virtual Instruction *addComputeNextPtr(Value *ItemArray, unsigned ItemIdx, Instruction *InsertPos, + const DebugLoc &DebugLoc, const DataLayout &DL) { + Value *Idx = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), getItemIndex(ItemIdx)); + Instruction *NextPtr = GetElementPtrInst::CreateInBounds(cast(ItemArray->getType())->getElementType(), + ItemArray, + makeArrayRef(&Idx, 1), + "mtce.nextptr.for." 
+ Twine(ItemIdx), + InsertPos); + NextPtr->setDebugLoc(DebugLoc); + return NextPtr; + } + + // Helper function to encapsulate index calculation + virtual unsigned getItemIndex(unsigned Item) { + return Item; + } + + LLVMContext &C; + const unsigned OtherCandidatesNumOf; + }; + + struct FalseMarker : public TrueMarker { + using base = TrueMarker; + + FalseMarker(LLVMContext &C, unsigned OtherCandidatesNumOf) : base(C, OtherCandidatesNumOf) {} + + bool mayPullFromFreeList() override { + return false; + } + + int computeEliminationCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + // assume 50% alloca (round up) + computeAllocaCost(ListTy, Factor, TTI, DL) + + Factor * TTI.getMemoryOpCost( + Instruction::Store, + // Pointer to next item + ListTy->getElementType(1)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(1)), + DL.getAllocaAddrSpace()) + + // Store parameters for the call itself + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Store, + ListTy->getElementType(0)->getPointerTo(), + DL.getABITypeAlignment(ListTy->getElementType(0)), + DL.getAllocaAddrSpace()); + } + + // Compute Cost for Transformation + int computeReturnPathCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + // One Conditional Branch for check return per iteration + Factor * OtherCandidatesNumOf * TTI.getCFInstrCost(Instruction::Br) + + // Cost for loading the next element per iteration + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getPointerTo(), + DL.getABITypeAlignment(ListTy), + DL.getAllocaAddrSpace()); + + } + + // Create the return path for the FalseMarker + // + // Generated Code: + // + // mtce.return: + // if (worklist == nullptr) + // goto mtce.retonly; + // + // mtce.loadnext: + // nextitem = *worklist + // goto loopentry; + // + // mtce.retonly: + // return + // + std::tuple insertReturnPath( + Instruction *ListItem, Instruction *FreeList, BasicBlock *MtceReturnBB, + PHINode *WorkListItemPHI, PHINode *FreeListRecursivePHI, + const DebugLoc &DebugLoc, const DataLayout &DL) override { + + Instruction *InsertPos; + + // Start by creating a basic block that adds an item to the free list + BasicBlock *LoadNextBB = MtceReturnBB->splitBasicBlock(MtceReturnBB->getFirstNonPHI(), + "mtce.loadnext"); + + // Check next item for null and return if necessary + InsertPos = MtceReturnBB->getTerminator(); + ICmpInst *CmpI = new ICmpInst(InsertPos, + ICmpInst::ICMP_EQ, + ListItem, + ConstantPointerNull::get(cast(ListItem->getType())), + "mtce.worklist.is.null.fm"); + CmpI->setDebugLoc(DebugLoc); + + // Create the final return only BB and branch to if indicated by the + // comparison. 
+ BasicBlock *ReturningBB = LoadNextBB->splitBasicBlock(LoadNextBB->getTerminator(), + "mtce.retonly"); + + BranchInst *CRBI = BranchInst::Create(ReturningBB, LoadNextBB, CmpI); + CRBI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(MtceReturnBB->getTerminator(), CRBI); + + BranchInst *LNBI = cast(LoadNextBB->getTerminator()); + BasicBlock *LoopEntry = WorkListItemPHI->getParent(); + LNBI->setSuccessor(0, LoopEntry); + + LoadInst *MLI = new LoadInst(ListItem, "mtce.mload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + LoadNextBB->getTerminator()); + MLI->setDebugLoc(DebugLoc); + WorkListItemPHI->addIncoming(MLI, LoadNextBB); + FreeListRecursivePHI->addIncoming(FreeList, LoadNextBB); + + return std::make_tuple(nullptr, ReturningBB); + } + }; + + struct TaggedMarker : public TrueMarker { + using base = TrueMarker; + + TaggedMarker(LLVMContext &C, unsigned OtherCandidatesNumOf) : base(C, OtherCandidatesNumOf) {} + + int computeAllocaCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + // We factor in the stores to create the inner links + return + base::computeAllocaCost(ListTy, Factor, TTI, DL) + + // Set the the marker bit, except for the first (accounted in + // computeEliminationCost) and last (no bit to set for the marked item) + Factor * (OtherCandidatesNumOf - 2) * TTI.getArithmeticInstrCost( + Instruction::Or, DL.getIntPtrType(ListTy->getPointerTo())); + } + + int computeEliminationCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + base::computeEliminationCost(ListTy, Factor, TTI, DL) + + Factor * TTI.getArithmeticInstrCost( + Instruction::Or, DL.getIntPtrType(ListTy->getPointerTo())); + } + + // Compute Cost for Transformation + int computeReturnPathCost(StructType *ListTy, int Factor, TargetTransformInfo &TTI, const DataLayout &DL) override { + return + // One and instruction to check bit 0 per iteration + Factor * OtherCandidatesNumOf * TTI.getArithmeticInstrCost( + Instruction::And, DL.getIntPtrType(ListTy->getPointerTo())) + + // One Conditional Branch for check bit 0 + Factor * OtherCandidatesNumOf * TTI.getCFInstrCost(Instruction::Br) + + // Mask bit 0, once per unmarked item + // One and instruction to check bit 0 per iteration + Factor * (OtherCandidatesNumOf - 1) * TTI.getArithmeticInstrCost( + Instruction::And, DL.getIntPtrType(ListTy->getPointerTo())) + + // Update freelist only once per block + computeFreelistManagementCost(ListTy, Factor, TTI, DL) + + // Check for return only once per block + Factor * TTI.getCFInstrCost(Instruction::Br) + + // Cost for loading the next element per iteration + Factor * OtherCandidatesNumOf * TTI.getMemoryOpCost( + Instruction::Load, + ListTy->getPointerTo(), + DL.getABITypeAlignment(ListTy), + DL.getAllocaAddrSpace()); + + } + + // Tagged marker only works if the list alignment is 2 or higher. Bit 0 is + // used to mark the last item in each block. + bool checkListType(Type *ListTy, const DataLayout &DL) override { + // Check if the last bit in the address is available + return (DL.getABITypeAlignment(ListTy) % 2 == 0); + } + + // Bit 0 in the next pointer in the previous items encodes the marker bit. + // If this is 1 then a further item exist in this allocation block, + // otherwise the last item is reached and the block can be added to the free + // list. 
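+    //
+    // Illustrative example (assumed layout, three items per block): the stored
+    // next pointers would look like
+    //
+    //   head         = tag(&item[0])   ; bit 0 set, item[0] is not the last of its block
+    //   item[0].next = tag(&item[1])   ; bit 0 set, item[1] is not the last of its block
+    //   item[1].next = &item[2]        ; untagged, item[2] is the last item of the block
+    //   item[2].next = previous worklist head
+    //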
+ Instruction *addComputeNextPtr(Value *ItemArray, unsigned ItemIdx, Instruction *InsertPos, + const DebugLoc &DebugLoc, const DataLayout &DL) override { + Instruction *UntaggedNextPtr = base::addComputeNextPtr(ItemArray, ItemIdx, + InsertPos, DebugLoc, DL); + if (ItemIdx == OtherCandidatesNumOf - 1) + return UntaggedNextPtr; + + IntegerType *IntPtrType = cast(DL.getIntPtrType(UntaggedNextPtr->getType())); + Instruction *PtrToInt = new PtrToIntInst(UntaggedNextPtr, + IntPtrType, + "mtce.ptoi." + Twine(ItemIdx), InsertPos); + PtrToInt->setDebugLoc(DebugLoc); + APInt Mark(IntPtrType->getBitWidth(), 1); + Instruction *TaggedIntPtr = BinaryOperator::Create(Instruction::Or, + PtrToInt, ConstantInt::get(C, Mark), + "mtce.listhead.msked.intptr", + InsertPos); + Instruction *NextPtr = new IntToPtrInst(TaggedIntPtr, UntaggedNextPtr->getType(), + "mtce.tagged.next.for." + Twine(ItemIdx), + InsertPos); + NextPtr->setDebugLoc(DebugLoc); + return NextPtr; + } + + // Create the return path for the TaggedMarker. + // + // Generated Code: + // + // mtce.return: + // intptr_t worlist_int = (intptr_t)worklist; + // if ( worklist_int & 1 == 0) + // goto mtce.check_return; + // + // mtce.unmarked: + // nextitem = *(worklist_ty*)(worklist_int & ~1); + // goto loopentry; + // + // mtce.add.freelist: + // nextitem = *worklist; + // ; Freelist management code is inserted here + // goto loopentry; + // + // mtce.retonly: + // return; + // + std::tuple insertReturnPath( + Instruction *ListItem, Instruction *FreeList, BasicBlock *MtceReturnBB, + PHINode *WorkListItemPHI, PHINode *FreeListRecursivePHI, + const DebugLoc &DebugLoc, const DataLayout &DL) override { + LLVMContext &C = MtceReturnBB->getContext(); + Instruction *InsertPos; + BasicBlock *LoopEntry = WorkListItemPHI->getParent(); + + // Start by creating a basic block to check the marker bit + BasicBlock *CheckReturnBB = MtceReturnBB->splitBasicBlock(MtceReturnBB->getFirstNonPHI(), + "mtce.check.return"); + InsertPos = MtceReturnBB->getTerminator(); + IntegerType *IntPtrType = cast(DL.getIntPtrType(ListItem->getType())); + + // Convert the pointer into integer to to perform logical operations + Instruction *PtrToInt = new PtrToIntInst(ListItem, IntPtrType, + "mtce.ptoi", InsertPos); + + PtrToInt->setDebugLoc(DebugLoc); + Instruction *Mark = new TruncInst(PtrToInt, IntegerType::get(C, 1), "mtce.mark", InsertPos); + + // Mask the tag bit and convert it back int pointer type + APInt Mask = IntPtrType->getMask(); + Mask.clearBit(0); + Instruction *MWLIntPtr = BinaryOperator::Create(Instruction::And, + PtrToInt, ConstantInt::get(C, Mask), + "mtce.masked.worklist.intptr", + InsertPos); + MWLIntPtr->setDebugLoc(DebugLoc); + Instruction *MWL = new IntToPtrInst(MWLIntPtr, ListItem->getType(), + "mtce.masked.worklist", + InsertPos); + MWL->setDebugLoc(DebugLoc); + + // Create a branch target for unmarked items, and replace the terminator + // by the conditional version. 
+ BasicBlock *UnmarkedItemBB = BasicBlock::Create(C, "mtce.unmarked", + CheckReturnBB->getParent(), + CheckReturnBB); + BranchInst *MarkBI = BranchInst::Create(UnmarkedItemBB, CheckReturnBB, Mark); + MWL->setDebugLoc(DebugLoc); + ReplaceInstWithInst(MtceReturnBB->getTerminator(), MarkBI); + + // When reaching unmarked items, just load the item and branch into the + // loop entry + LoadInst *UMLI = new LoadInst(MWL, "mtce.umload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + UnmarkedItemBB); + + UMLI->setDebugLoc(DebugLoc); + + BranchInst *UMBI = BranchInst::Create(LoopEntry, UnmarkedItemBB); + UMBI->setDebugLoc(DebugLoc); + + WorkListItemPHI->addIncoming(UMLI, UnmarkedItemBB); + FreeListRecursivePHI->addIncoming(FreeList, UnmarkedItemBB); + + // The code generated above handles the case for marked items. + // Next split if a block for checking the return condition. + // + // Note, the unmasked version of the worklist pointer can be used, since + // we know that the tag bit is zero. + BasicBlock *AddFreeListBB = CheckReturnBB->splitBasicBlock(CheckReturnBB->getFirstNonPHI(), + "mtce.add.freelist"); + InsertPos = CheckReturnBB->getTerminator(); + ICmpInst *CmpI = new ICmpInst(InsertPos, + ICmpInst::ICMP_EQ, + ListItem, + ConstantPointerNull::get(cast(ListItem->getType())), + "mtce.worklist.is.null.tg"); + CmpI->setDebugLoc(DebugLoc); + + // Create the final return only BB and branch to if indicated by the + // comparison. + BasicBlock *ReturningBB = AddFreeListBB->splitBasicBlock(AddFreeListBB->getTerminator(), + "mtce.retonly"); + BranchInst *CRBI = BranchInst::Create(ReturningBB, AddFreeListBB, CmpI); + CRBI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(CheckReturnBB->getTerminator(), CRBI); + + // Before managing the freelist the item has to be loaded in order not to + // change the next pointer of the worklist improperly. The according + // instructions is added here and the branch is updated (including the PHI node) + LoadInst *MLI = new LoadInst(ListItem, "mtce.mload", false, + DL.getABITypeAlignment(WorkListItemPHI->getType()), + AddFreeListBB->getTerminator()); + MLI->setDebugLoc(DebugLoc); + + BranchInst *ABI = cast(AddFreeListBB->getTerminator()); + ABI->setSuccessor(0, LoopEntry); + + WorkListItemPHI->addIncoming(MLI, AddFreeListBB); + + return std::make_tuple(ABI, ReturningBB); + } + }; + + typedef std::forward_list CandidateContainer; + + /// Current DataLayout. + const DataLayout &DL; + + /// Current Context. + LLVMContext &C; + + /// Target Information + TargetTransformInfo &TTI; + + /// Optimization Remark Emitter to use. + OptimizationRemarkEmitter &ORE; + + /// Function the pass is working on. + Function &F; + + /// Alias Analysis of the current function. + AliasAnalysis &AA; + + /// Type of list elements for the selected marker algorithm; + StructType *ListTy; + + /// List of call candidates to be eliminated, this list contains the single + /// first candidate. + CandidateContainer FirstCandidate; + + /// List of the remaining call candidates to be eliminated. + CandidateContainer OtherCandidates; + + /// Number of elements in the OtherCandidates List + unsigned OtherCandidatesNumOf; + + /// List of arguments for the function H. + SmallVector IncomingArguments; + + /// List of movable instructions. + SmallVector MovableInst; + + /// List of static instructions. + SmallVector StaticInst; + + /// Selected Marker Algorithm + std::unique_ptr Marker; + + /// Find all recursive calls within the current basic block and add them to + /// the list of candidates. 
+ /// + /// \returns true if at leas one recursive call is found. + bool appendAllRecursiveCallsInBB(Function &F, + BasicBlock &BB, + unsigned &NumOf, + CandidateContainer &Candidates, + CandidateContainer::iterator &LastCandidate) { + bool found = false; + for (auto &I: BB) { + CallInst *CI = dyn_cast(&I); + if (CI && CI->getCalledFunction() == &F) { + LastCandidate = Candidates.emplace_after(LastCandidate, CI, NumOf); + NumOf++; + found = true; + } + } + return found; + } + + /// Test if the given basic block is a return only BB. + bool isReturnOnlyBB(const BasicBlock &BB) const { + const ReturnInst *RI = dyn_cast(BB.getTerminator()); + return (RI != nullptr && BB.getFirstNonPHIOrDbg() == RI); + } + + /// Return true if it is safe to move the specified + /// instruction from after the call to before the call, assuming that all + /// instructions between the call and this instruction are movable. + /// + /// This is mostly a copy of canMoveAboveCall within TailCallElimination. + /// + /// \returns true if the instruction can be moved. + bool canMoveAbove(Instruction *I, Instruction *SI) { + // FIXME: We can move load/store/call/free instructions above the call if the + // call does not mod/ref the memory location being processed. + if (I->mayHaveSideEffects()) // This also handles volatile loads. + return false; + + if (LoadInst *L = dyn_cast(I)) { + // Loads may always be moved above calls without side effects. + if (SI->mayHaveSideEffects()) { + const DataLayout &DL = L->getModule()->getDataLayout(); + if (isModSet(AA.getModRefInfo(SI, MemoryLocation::get(L))) || + !isSafeToLoadUnconditionally(L->getPointerOperand(), + L->getAlignment(), DL, L)) + return false; + } + } + + // Otherwise, if this is a side-effect free instruction, check to make sure + // that it does not use the return value of the call. If it doesn't use the + // return value of the call, it must only use things that are defined before + // the call, or movable instructions between the call and the instruction + // itself. + return !is_contained(I->operands(), SI); + } + + /// Analyze the current function and return true if it is feasible and + /// profitable to apply the transformation. + /// + /// \returns the number of candidates found. + bool analyzeFunction() { + // Number of basic blocks with all recursive calls + int BBwithCandidates = 0; + bool hasAllocas = false; + + CandidateContainer Candidates; + auto LastCandidate = Candidates.before_begin(); + + // We cannot handle variadic arguments. + if (F.isVarArg()) + return false; + + // TODO (improvement): There might be ways to implement non void returns. For example + // similar to accumulator TREs. + if (F.getReturnType() != Type::getVoidTy(F.getContext())) + return false; + + // No, can't be done. + if (F.callsFunctionThatReturnsTwice()) + return false; + + unsigned CandidatesNumOf = 0; + for (auto &BB: F) { + // EH pads cannot be handled at all + if (BB.isEHPad()) { + return false; + } + + // Alloca instructions are not allowed. + // + // TODO (improvement): Alloca can be theoretically be handled if it can be + // proven that the used memory lifetime is bounded before the first call. + // In this case the alloca can be moved into the new entry block the same + // way tail call elimination does. + for (auto &I: BB) { + if (isa(&I)) { + hasAllocas = true; + } + } + + if (isa(BB.getTerminator())) { + // For returning blocks find all candidates. 
if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates, LastCandidate)) {
+          BBwithCandidates++;
+        }
+      }
+      else if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+        // For non-returning blocks find all candidates, iff the block has a
+        // single successor which is a return-only block.
+        if (BI->isUnconditional() && isReturnOnlyBB(*BI->getSuccessor(0))) {
+          if (appendAllRecursiveCallsInBB(F, BB, CandidatesNumOf, Candidates, LastCandidate)) {
+            BBwithCandidates++;
+          }
+        }
+      }
+
+      // Currently only one BB with tail recursive calls is allowed.
+      //
+      // TODO (improvement): There might be interesting cases with recursive
+      // calls in more than one BB.
+      if (BBwithCandidates > 1) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed",
+                                          F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+                 << ore::NV("Reason", "Multiple Basic Blocks");
+        });
+        return false;
+      }
+    }
+
+    // No MTCE required. Leave to TRE.
+    if (CandidatesNumOf < 2) {
+      return false;
+    }
+
+    if (hasAllocas) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed",
+                                        F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+               << ore::NV("Reason", "Functions containing alloca instructions not yet supported");
+      });
+      return false;
+    }
+
+    BasicBlock *BB = Candidates.front().getCallInst()->getParent();
+    Instruction *TI = BB->getTerminator();
+
+    // All instructions between the first candidate and the last candidate are
+    // partitioned into static (unmovable) instructions and movable instructions.
+    // The resulting two arrays are ordered in the required execution sequence.
+    //
+    // This means that the execution order (MovableInst | StaticInst) is legal
+    // and equivalent to the original sequence.
+    //
+    // Algorithm:
+    //
+    // * Iterate over all instructions between Candidate[n] and Candidate[n+1].
+    //
+    // * Test if the instruction can be moved above all identified static
+    //   instructions.
+    //
+    // * If it can, add it to the movable instruction list, otherwise add it to
+    //   the static instruction list.
+    //
+    // * Consecutively add static instructions if necessary.
+
+    // Iterate over all instructions in the interval (Candidate[x], Candidate[x+1]).
+    // Virtually include the terminator as an additional candidate.
+    //
+    // Set up the first iteration interval between Candidate[0] and Candidate[1].
+    // Note: The list has at least two members.
+    auto Cand0 = Candidates.begin();
+    auto Cand1 = next(Cand0);
+    BasicBlock::iterator BBI(Cand0->getCallInst());
+    auto CandI = Cand1;
+
+    // The first candidate is always a single call which is a static instruction.
+    StaticInst.push_back(&*BBI);
+
+    while (true) {
+      // Identify the instruction that marks the end of the current interval.
+      //
+      // It is either CandI's call, or the terminator instruction (which is
+      // virtually added).
+      Instruction *E;
+      if (CandI != Candidates.end()) {
+        E = CandI->getCallInst();
+      }
+      else {
+        E = TI;
+      }
+
+      // Advance the iterator first, because we start with the first call
+      // instruction which is already added.
+      ++BBI;
+
+      // If we reached the terminator we are done.
+      if (&*BBI == TI)
+        break;
+
+      if (&*BBI == E) {
+        StaticInst.push_back(&*BBI);
+        // Advance the current iteration interval to the next candidate.
+        ++CandI;
+        continue;
+      }
+
+      // Test if the instruction can be moved and add it accordingly.
bool movable = true;
+      for (auto &SI: StaticInst) {
+        if (canMoveAbove(&*BBI, &*SI))
+          continue;
+        movable = false;
+        break;
+      }
+
+      if (movable)
+        MovableInst.push_back(&*BBI);
+      else
+        StaticInst.push_back(&*BBI);
+    }
+
+    // Assign all static instructions to the candidates. We do it naively by
+    // just assigning the instructions in between two candidates to the second
+    // one.
+    auto SI = StaticInst.begin();
+    for (auto &Cand: Candidates) {
+      while (*SI != Cand.getCallInst()) {
+        Cand.addInstruction(*SI);
+        SI++;
+      }
+      // Skip candidate's call inst.
+      SI++;
+    }
+
+    // If not all static instructions have been assigned, then there are
+    // unmovable trailing instructions and the transformation is abandoned.
+    if (SI != StaticInst.end()) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", *SI)
+               << ore::NV("Reason", "Instructions after last call.");
+      });
+      return false;
+    }
+
+    // Analyze each call candidate to check whether the call and the
+    // corresponding static instructions are suitable to be eliminated.
+    for (auto &Cand: Candidates) {
+      if (!Cand.analyze()) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand.getCallInst())
+                 << ore::NV("Reason", "RecursiveCall not viable");
+        });
+        return false;
+      }
+    }
+
+    // Test if the functions generated during analysis (the step before) behave
+    // equivalently.
+    for (auto CCI = std::next(Cand1), CCE = Candidates.end();
+         CCI != CCE; ++CCI) {
+
+      // Test if the function H(...) of both candidates matches.
+      if (!CCI->compareFunctionH(&*Cand1)) {
+        ORE.emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", CCI->getCallInst())
+                 << ore::NV("Reason", "RecursiveCall nonviable (Function H does not match)");
+        });
+        return false;
+      }
+    }
+
+    // Analyze the operands for each function H and categorize them into
+    // Stacked, Constant and Static.
+    IncomingArguments.resize(Cand1->getIncomingOperands().size(), IncomingArgument());
+    for (auto CCI = Cand1, CCE = Candidates.end(); CCI != CCE; ++CCI) {
+      assert(IncomingArguments.size() == CCI->getIncomingOperands().size() && "Operand size mismatch");
+
+      auto IAI = IncomingArguments.begin();
+      for (Use *U: CCI->getIncomingOperands()) {
+        IAI->addUse(&*CCI, U);
+
+        // If the value is an Argument we need to check the position in the
+        // first candidate too. It is sufficient here to just execute the checks.
+        if (isa<Argument>(U->get())) {
+          IAI->addUse(&*Cand0, U, true);
+        }
+        ++IAI;
+      }
+    }
+
+    // Count the categories.
+    int NumStaticArguments = 0;
+    int NumConstantArguments = 0;
+    for (auto &IA: IncomingArguments) {
+      if (IA.isConstant())
+        NumConstantArguments++;
+      if (IA.isStatic())
+        NumStaticArguments++;
+    }
+
+    // If we do not spare at least two arguments from the list, no improvement
+    // is expected, so the analysis bails with a negative result.
+ // TODO: This might be too defensive as further checks are added below + if (IncomingArguments.size() - NumStaticArguments - NumConstantArguments + 2 > F.arg_size()) { + LLVM_DEBUG(dbgs() << "Will not apply - no argument reduction.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand0->getCallInst()) + << ore::NV("Reason", "No argument reduction."); + }); + return false; + } + + OtherCandidatesNumOf = CandidatesNumOf - 1; + + // Select a suitable marker algorithm + if (DisableFreeList) { + Marker.reset(new FalseMarker(C, OtherCandidatesNumOf)); + } + else { + if (OtherCandidatesNumOf == 1) { + Marker.reset(new TrueMarker(C, OtherCandidatesNumOf)); + } + else { + Marker.reset(new TaggedMarker(C, OtherCandidatesNumOf)); + } + } + + ListTy = createListType(); + + // Check if the marker is compatible with the ListTy. e.g.: TaggedMarker + // require a list alignment of 2 or multiples of 2. + if (!Marker->checkListType(ListTy, DL)) { + LLVM_DEBUG(dbgs() << "Will not apply - ListTy incompatible.\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand0->getCallInst()) + << ore::NV("Reason", "Invalid ListTy."); + }); + return false; + } + + // Just a factor to avoid rounding errors. + int Factor = 2; + + int OldCost = 0; + for (auto &Cand: Candidates) + OldCost += TTI.getInstructionCost(Cand.getCallInst(), TargetTransformInfo::TCK_Latency); + OldCost *= Factor; + + int NewCost = Marker->computeTransformCost(ListTy, Factor, TTI, DL); + + if (NewCost >= OldCost) { + LLVM_DEBUG(dbgs() << "Will not apply - too costly\n"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Multi tail call elimination missed", Cand0->getCallInst()) + << ore::NV("Reason", "Too costly") + << ore::NV("OldCost", OldCost) + << ore::NV("NewCost", NewCost); + }); + return false; + } + + // Splice the list into the two output lists. One contains the first element + // only, the other containing the remaining elements. + OtherCandidates.splice_after(OtherCandidates.before_begin(), Candidates, + Candidates.begin(), Candidates.end()); + FirstCandidate.splice_after(FirstCandidate.before_begin(), Candidates); + + return true; + } + + /// Creates the worklist structure used to model the pending call list. + StructType *createListType() const { + // Collect the types of the operands + SmallVector ListElements; + for (auto &IA: IncomingArguments) { + if (IA.isStacked()) + ListElements.push_back(IA.getType()); + } + + // Create a new StructType representing the list. + StructType *ListTy = StructType::create(C, "mtce.listty"); + StructType *OperandsTy = StructType::get(C, makeArrayRef(ListElements), false); + + SmallVector LinkItems; + LinkItems.push_back(ListTy->getPointerTo()); + Marker->adjustStructFields(LinkItems); + StructType *LinkTy = StructType::get(C, makeArrayRef(LinkItems), false); + + Type *ListItemTys[2] = { OperandsTy, LinkTy }; + ListTy->setBody(makeArrayRef(ListItemTys, 2), false); + + return ListTy; + } + + /// Prepare the entry block by splitting of the loop entry. The following + /// PHINodes are added to the loop entry. + /// + /// * One for each argument (Also the uses are adjusted). + /// * List head representing the root of the worklist. + /// * If enabled the root to the free list. 
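+  ///
+  /// For example, for a function `void f(i32 %a1, i32 %a2, i32 %a3)` (as in
+  /// the tests below) the loop entry ends up roughly as:
+  ///
+  ///   mtce.loop:
+  ///     %mtce.listhead = phi %mtce.listty* [ null, %entry ], ...
+  ///     %mtce.freelist = phi %mtce.listty* [ null, %entry ], ...
+  ///     %mtce.a1 = phi i32 [ %a1, %entry ], ...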
+  template <typename ArgumentPHIContainer>
+  std::tuple<PHINode *, PHINode *, BasicBlock *>
+  prepareEntry(ArgumentPHIContainer &ArgumentPHIs) {
+    const DebugLoc &DebugLoc = FirstCandidate.front().getCallInst()->getDebugLoc();
+
+    // Split the entry block at the first viable instruction.
+    BasicBlock *Entry = &F.getEntryBlock();
+    BasicBlock *LoopEntry = Entry->splitBasicBlock(Entry->getFirstNonPHI(), "mtce.loop");
+    Entry->getTerminator()->setDebugLoc(DebugLoc);
+
+    // Setup list PHI nodes for the loop entry.
+    Instruction *InsertPos = &LoopEntry->front();
+    PHINode *ListHead = PHINode::Create(ListTy->getPointerTo(), 2,
+                                        "mtce.listhead",
+                                        InsertPos);
+    ListHead->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()), Entry);
+
+    // Setup the FreeList if necessary.
+    PHINode *FreeList = nullptr;
+    FreeList = PHINode::Create(ListTy->getPointerTo(), 2,
+                               "mtce.freelist",
+                               InsertPos);
+    FreeList->addIncoming(ConstantPointerNull::get(ListTy->getPointerTo()), Entry);
+
+    // Setup Argument PHI nodes for the loop entry - see TRE.
+    for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(), 2,
+                                    "mtce." + I->getName(), InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+      PN->addIncoming(&*I, Entry);
+      ArgumentPHIs.push_back(PN);
+    }
+
+    return std::make_tuple(ListHead, FreeList, LoopEntry);
+  }
+
+  /// Adjust all ArgumentPHIs according to the CallInst to be eliminated.
+  ///
+  /// Arguments are taken from CI. IncomingBB is used as the BB to add to the
+  /// PHI list.
+  template <typename ArgumentPHIContainer>
+  void adjustPHIsFromCallInst(
+      BasicBlock *IncomingBB, CallInst *CI,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+    // Adjust PHI nodes in the loop entry.
+    CallInst::op_iterator Arg = CI->arg_begin();
+    for (auto &PHI: ArgumentPHIs) {
+      assert(Arg != CI->arg_end() && "Callsite does not provide enough arguments.");
+      PHI->addIncoming(Arg->get(), IncomingBB);
+      Arg++;
+    }
+    assert(Arg == CI->arg_end() && "Callsite provided too many arguments.");
+  }
+
+  /// Eliminates the first call just like it is done within tail call
+  /// elimination. Additionally it moves all movable instructions before the
+  /// first call candidate, and it splits the remaining calls into a separate
+  /// basic block. ArgumentPHIs are adjusted accordingly.
+  //
+  // calling.bb:
+  //   ...
+  //   call @recursive ...
+  //
+  //   %movable1 = add ...
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %movable2 = add ...
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  //
+  // ->
+  //
+  // calling.bb:
+  //   ...
+  //   %movable1 = add ...
+  //   %movable2 = add ...
+  //   br %loopentry
+  //
+  // H1:
+  //
+  //   %static1 = call @fn1, %movable1
+  //   call @recursive ...
+  //
+  //   %static2 = call @fn1, %movable2
+  //   call @recursive ...
+  //
+  //   br ret
+  template <typename ArgumentPHIContainer>
+  std::tuple<BasicBlock *, BasicBlock *>
+  eliminateFirstCall(RecursiveCall &CC,
+                     BasicBlock *LoopEntry,
+                     const ArgumentPHIContainer &ArgumentPHIs) {
+    CallInst *CI = CC.getCallInst();
+
+    // Move all movable instructions before this call instruction.
+    for (auto &I: MovableInst)
+      I->moveBefore(CI);
+
+    // Split off the remaining static (unmovable) instructions.
+    BasicBlock *CallerBB = CI->getParent();
+    // For now MtceReturnBB still contains all static instructions. These will
+    // be removed and this BB becomes the new return block.
+    BasicBlock *MtceReturnBB = CallerBB->splitBasicBlock(CI, "mtce.return");
+    BranchInst *BI = cast<BranchInst>(CallerBB->getTerminator());
+
+    // Create the loop to eliminate the call to the first candidate.
+ BI->setSuccessor(0, LoopEntry); + BI->setDebugLoc(CI->getDebugLoc()); + + adjustPHIsFromCallInst(CallerBB, CI, ArgumentPHIs); + CC.eraseSourceCallInst(); + + return std::make_tuple(CallerBB, MtceReturnBB); + } + + /// Adds code to bypass the alloca (inserting by eliminateOtherCalls) with + /// code to picking it from the freelist. + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // + // ; allocate memory on stack + // %itemptr = alloca %listty, 2 + // + // ; compute address if list items + // %gep.0 = getelementptr %listty, 0 + // ... + // %gep.k = getelementptr %listty, k + // + // ; setup and store list item [0]; set mark to false + // mark.0 = + // %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0 + // store %gep.1, *nextptr.0 + // ... + // mark.k = + // %nextptr.k = getelementptr %listty, itemptr, k, 1, 0 + // store %listhead, %nextptr.k + // + // ; setup and store operands entries + // %opreands.0 = getelementptr %listty, itemptr, 0, 0 + // %arg.0-0 = insertvalue mark_0, undef, %movable.0-0 + // store %arg.0-0, %operands.0 + // ... + // + // -> [AI = %itemptr, AllocaEnd = %nextptr.k, NewHeadPtr = %gep.0] + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // + // alloca: + // ; allocate memory on stack + // %itemptr = alloca %listty, 2 + // + // ; compute address if list items + // %gep.0 = getelementptr %listty, 0 + // ... + // %gep.k = getelementptr %listty, k + // + // ; setup and store list item [0]; set mark to false + // mark.0 = + // %nextptr.0 = getelementptr %listty, %itemptr, 0, 1, 0 + // store %gep.1, *nextptr.0 + // ... + // mark.k = + // + // allocfromfree: + // %gep.0.fromfree = getelementptr %listty, %freelist, ... + // %nextptr = getelementptr %listty, 0, #STRUCT_IDX_OFNEXT + // %next = load %nextptr + // br calling.bb.1 + // + // calling.bb.1: + // %newheadptrphi = phi %listty, [%gep.0, %allocaa], [%gep.0.fromfree.fromfree, %allocfromfree] + // %allocated = phi %listty, [%itemptr, %alloca], [%freelist, %allocfromfree] + // %newfreelist = phi %listty, [%freelist, %alloca], [%next, %allocfromfree] + // + // %nextptr.k = getelementptr %listty, %allocated, k, 1, 0 + // store %listhead, %nextptr.k + // + // ; setup and store operands entries + // %opreands.0 = getelementptr %listty, %allocated, 0, 0 + // %arg.0-0 = insertvalue mark_0, undef, %movable.0-0 + // store %arg.0-0, %operands.0 + // ... + void replaceAllocaWithFreeList(AllocaInst *AI, Instruction *AllocaEnd, + Instruction *NewHeadPtr, PHINode *FreeList) { + // Never generate code to pull from free list if Marker does not require it. + if (!Marker->mayPullFromFreeList()) { + FreeList->addIncoming(FreeList, AI->getParent()); + return; + } + + const DebugLoc &DebugLoc = AI->getDebugLoc(); + BasicBlock *PreCallingBB = AI->getParent(); + BasicBlock::iterator InsertPos(AI); + + // Split and create BBs. 
+ BasicBlock *AllocaBB = PreCallingBB->splitBasicBlock(InsertPos, "mtce.alloca"); + InsertPos = BasicBlock::iterator(AllocaEnd); + BasicBlock *CallingBB = AllocaBB->splitBasicBlock(InsertPos, PreCallingBB->getName() + ".1"); + BasicBlock *PullBB = BasicBlock::Create(C, "mtce.allocfromfree", &F, CallingBB); + + // Compute address + Value *IdxNext[] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(0)), + ConstantInt::get(IntegerType::get(C, 32), 1), + ConstantInt::get(IntegerType::get(C, 32), 0), + }; + Instruction *PulledItem0 = GetElementPtrInst::CreateInBounds(ListTy, + FreeList, + makeArrayRef(IdxNext, 1), + "mtce.pulleditemgep.0", + PullBB); + PulledItem0->setDebugLoc(DebugLoc); + + IdxNext[0] = ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(OtherCandidatesNumOf - 1)); + Instruction *NewFreePtr = GetElementPtrInst::CreateInBounds(ListTy, + FreeList, + makeArrayRef(IdxNext, 3), + "mtce.freelist.nextptr", + PullBB); + NewFreePtr->setDebugLoc(DebugLoc); + + // Load the next free item address + LoadInst *NewFree = new LoadInst(NewFreePtr, + "mtce.newfreelist", + false, + DL.getABITypeAlignment(ListTy->getPointerTo()), + PullBB); + NewFree->setDebugLoc(DebugLoc); + + // Create branches + BranchInst *BI = BranchInst::Create(CallingBB, PullBB); + BI->setDebugLoc(DebugLoc); + + InsertPos = BasicBlock::iterator(CallingBB->getFirstNonPHI()); + PHINode *AllocatedPHI = PHINode::Create(ListTy->getPointerTo(), 2, + "mtce.allocated", + &*InsertPos); + AllocatedPHI->setDebugLoc(DebugLoc); + + // Replace all remaining uses of AI with the new PHInode + for (auto i = AI->use_begin(), e = AI->use_end(); i != e; /* in loop */) { + Use &U = *(i++); + if (Instruction *I = dyn_cast(U.getUser())) { + if (I->getParent() == CallingBB) + U.set(AllocatedPHI); + } + } + + AllocatedPHI->addIncoming(AI, AllocaBB); + AllocatedPHI->addIncoming(FreeList, PullBB); + + PHINode *UpdatedFreeList = PHINode::Create(ListTy->getPointerTo(), 2, + "mtce.freelist.next", + &*InsertPos); + UpdatedFreeList->setDebugLoc(DebugLoc); + UpdatedFreeList->addIncoming(FreeList, AllocaBB); + UpdatedFreeList->addIncoming(NewFree, PullBB); + FreeList->addIncoming(UpdatedFreeList, UpdatedFreeList->getParent()); + + InsertPos = BasicBlock::iterator(PreCallingBB->getTerminator()); + ICmpInst *CmpI = new ICmpInst(&*InsertPos, + ICmpInst::ICMP_EQ, + FreeList, + ConstantPointerNull::get(ListTy->getPointerTo()), + "mtce.cmp_freelist_empty"); + CmpI->setDebugLoc(DebugLoc); + + BI = BranchInst::Create(AllocaBB, PullBB, CmpI); + BI->setDebugLoc(DebugLoc); + ReplaceInstWithInst(PreCallingBB->getTerminator(), BI); + + return; + } + + /// Eliminate other recursive calls by adding the structures to the itemlist. + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // br %loopentry + // + // -> + // + // calling.bb: + // ... + // %movable.0 = add ... + // %movable.k = add ... + // + // ; allocate memory on stack + // %itemptr = alloca %listty, 2 + // + // ; compute address if list items + // %gep_0 = getelementptr %listty, 0 + // ... + // %gep_k = getelementptr %listty, k + // + // ; setup and store list item [0]; set mark to false + // mark.0 = + // %nextptr.0 = getelementptr %listty, itemptr, 0, 1, 0 + // store %gep.1, *nextptr.0 + // ... 
+ // mark.k = + // %nextptr.k = getelementptr %listty, itemptr, k, 1, 0 + // store %listhead, %nextptr.k + // + // ; setup and store operands entries + // %opreands.0 = getelementptr %listty, itemptr, 0, 0 + // %arg.0-0 = insertvalue mark_0, undef, %movable.0-0 + // store %arg.0-0, %operands.0 + // ... + // %opreands.k = getelementptr %listty, itemptr, k, 0 + // %arg.k-0 = insertvalue mark_1, undef, %movable.k-0 + // store %arg.k-0, %operands.k + // + // br %loopentry + // + // ; below a reference to the source instruction + // + // H1: + // + // %static.0 = call @fn1, %movable.0 + // call @recursive ... + // + // %static.k = call @fn1, %movable.k + // call @recursive ... + // + // br ret + void eliminateOtherCalls(CandidateContainer &OtherCandidates, + BasicBlock *CallerBB, + PHINode *ListHead, + PHINode *FreeList) { + Instruction *InsertPos = CallerBB->getTerminator(); + Type *OperandsTy = ListTy->getTypeAtIndex(0U); + + // Add an entry to the list for all candidates except the first. This to + // eliminates the calls to those candidates. + AllocaInst *AllocaPtr = new AllocaInst(ListTy, + DL.getAllocaAddrSpace(), + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), OtherCandidatesNumOf), + DL.getABITypeAlignment(ListTy), + "mtce.itemptr", + InsertPos); + AllocaPtr->setDebugLoc(InsertPos->getDebugLoc()); + + // Fill list link structure with the necessary values. With a freelist + // available all values except the last nextptr are constant during this + // execution instance. The address values are all relative to the allocated + // address, and the last mark (if any) also do not need to change when + // pulling the element from the freelist. AllocaEnd marks the first + // instruction that must be executed when pulling from the freelist. + Instruction *AllocaEnd = nullptr; + unsigned idx = 0; + for (auto &Cand: OtherCandidates) { + const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc(); + + Value *Idx[3] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(idx)), + ConstantInt::get(IntegerType::get(C, 32), 1), + ConstantInt::get(IntegerType::get(C, 32), 0) }; + + Instruction *LinkPtrGEP = GetElementPtrInst::CreateInBounds(ListTy, + AllocaPtr, + makeArrayRef(Idx, 2), + "mtce.linkptr." + Twine(idx), + InsertPos); + LinkPtrGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc()); + + Instruction *NextPtrGEP = GetElementPtrInst::CreateInBounds(ListTy, + AllocaPtr, + makeArrayRef(Idx, 3), + "mtce.nextptr." + Twine(idx), + InsertPos); + NextPtrGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc()); + + Value *NextVal; + if (idx < OtherCandidatesNumOf - 1) { + NextVal = Marker->addComputeNextPtr(AllocaPtr, idx + 1, InsertPos, + Cand.getCallInst()->getDebugLoc(), DL); + } + else { + NextVal = ListHead; + AllocaEnd = NextPtrGEP; + } + + StoreInst *SNPI = new StoreInst(NextVal, + NextPtrGEP, + false, + DL.getABITypeAlignment(NextVal->getType()), + InsertPos); + SNPI->setDebugLoc(DebugLoc); + + idx++; + } + + // Fill the operands part of the ListItems + idx = 0; + for (auto &Cand: OtherCandidates) { + const DebugLoc &DebugLoc = Cand.getCallInst()->getDebugLoc(); + + unsigned ValueIdx = 0; + Instruction *OperandsVal = nullptr; + for (auto &IA: IncomingArguments) { + if (!IA.isStacked()) + continue; + + Value *ValueToStore = IA.getValue(&Cand); + + OperandsVal = InsertValueInst::Create((OperandsVal ? cast(OperandsVal) : cast(UndefValue::get(OperandsTy))), + ValueToStore, + makeArrayRef(&ValueIdx, 1), + "mtce.arg." 
+ Twine(idx) + ".Op" + Twine(ValueIdx), + InsertPos); + OperandsVal->setDebugLoc(DebugLoc); + ValueIdx++; + } + + // OperandsVal might be null if no operands have to be stored in the list. + if (OperandsVal) { + // Create the GEP that points to the operand struct + Value *Idx[2] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), Marker->getItemIndex(idx)), + ConstantInt::get(IntegerType::get(C, 32), 0) }; + Instruction *OperandsGEP = GetElementPtrInst::CreateInBounds(ListTy, + AllocaPtr, + makeArrayRef(Idx, 2), + "mtce.operandsgep." + Twine(idx), + InsertPos); + OperandsGEP->setDebugLoc(Cand.getCallInst()->getDebugLoc()); + + StoreInst *SOVI = new StoreInst(OperandsVal, + OperandsGEP, + false, + DL.getABITypeAlignment(OperandsTy), + InsertPos); + SOVI->setDebugLoc(DebugLoc); + } + + idx++; + } + + Instruction *NewHead = Marker->addComputeNextPtr(AllocaPtr, 0, InsertPos, + OtherCandidates.front().getCallInst()->getDebugLoc(), DL); + // Redirect list head by, adjusting listhead PHI Node in the loop entry. + ListHead->addIncoming(NewHead, + NewHead->getParent()); + + replaceAllocaWithFreeList(AllocaPtr, AllocaEnd, nullptr, FreeList); + } + + BasicBlock *insertReturnPath( + BasicBlock *MtceReturnBB, Instruction *ListHead, + Instruction *FreeList, PHINode *WorkListItemPHI, + PHINode *FreeListRecursivePHI) { + // If the block is a non-returning block, then the following block is the a + // return only block and the branch is an unconditional one. In this case + // the branch instruction is replaced by a clone of the return instruction + // of the successor BB. + if (BranchInst *BI = dyn_cast(MtceReturnBB->getTerminator())) { + BasicBlock *RetOnlyBB = MtceReturnBB->getSingleSuccessor(); + assert(RetOnlyBB && "Recursion has more than one successor"); + Instruction *NewTI = RetOnlyBB->getTerminator()->clone(); + ReplaceInstWithInst(BI, NewTI); + } + + const DebugLoc &DebugLoc = MtceReturnBB->getTerminator()->getDebugLoc(); + + Instruction *InsertPos; + BasicBlock *ReturningBB; + std::tie(InsertPos, ReturningBB) = Marker->insertReturnPath( + ListHead, FreeList, MtceReturnBB, WorkListItemPHI, + FreeListRecursivePHI, DebugLoc, F.getParent()->getDataLayout()); + + // If a basic block for freelist management was added, add the corresponding + // Instructions. + if (InsertPos) { + // Compute address of bulk allocation. + signed IdxOfFirstItem = -(signed)Marker->getItemIndex(OtherCandidatesNumOf - 1); + Value *IdxBulk[] = { + ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()), IdxOfFirstItem), + }; + + Instruction *FreeListAdded = GetElementPtrInst::CreateInBounds( + ListTy, ListHead, makeArrayRef(IdxBulk, 1), "mtce.freelist.added", InsertPos); + FreeListAdded->setDebugLoc(DebugLoc); + + // Adjust Freelist PHI + FreeListRecursivePHI->addIncoming(FreeListAdded, InsertPos->getParent()); + + // Store the old list head into the last element of the item. This is done + // after potentially splitting of the conditional branch, to avoid messing + // up with `NewFreeHead->replaceAllUsesWith(FreeNextPHI);`, we want to use + // this the original value here. 
+
+      // Compute the address of the last item in the block.
+      Value *IdxNext[] = {
+        ConstantInt::get(DL.getIntPtrType(C, DL.getAllocaAddrSpace()),
+                         Marker->getItemIndex(OtherCandidatesNumOf - 1)),
+        ConstantInt::get(IntegerType::get(C, 32), 1),
+        ConstantInt::get(IntegerType::get(C, 32), 0),
+      };
+      Instruction *FreeNext = GetElementPtrInst::CreateInBounds(ListTy,
+                                                                FreeListAdded,
+                                                                makeArrayRef(IdxNext, 3),
+                                                                "mtce.freenextptr",
+                                                                &*InsertPos);
+      FreeNext->setDebugLoc(DebugLoc);
+
+      StoreInst *SI = new StoreInst(FreeList,
+                                    FreeNext,
+                                    false,
+                                    DL.getABITypeAlignment(ListTy->getPointerTo()),
+                                    &*InsertPos);
+      SI->setDebugLoc(DebugLoc);
+    }
+
+    return ReturningBB;
+  }
+
+  template <typename ArgumentPHIContainer>
+  std::tuple<PHINode *, PHINode *> createRecursiveLoopEntry(
+      BasicBlock *LoopEntry, CandidateContainer &OtherCandidates,
+      PHINode *ListHead, PHINode *FreeList,
+      const ArgumentPHIContainer &ArgumentPHIs) {
+    RecursiveCall &CC = OtherCandidates.front();
+    const DebugLoc &DebugLoc = CC.getCallInst()->getDebugLoc();
+
+    // Create the basic block for the loop entry, used when executing an item
+    // from the worklist.
+    BasicBlock *EntryBB = BasicBlock::Create(C, "mtce.loop.from.worklist", &F, LoopEntry);
+
+    // Create a worklist item as PHINode that contains the referenced list item.
+    PHINode *WLItem = PHINode::Create(ListTy, 1, "mtce.worklist.item", EntryBB);
+    WLItem->setDebugLoc(DebugLoc);
+    PHINode *FreeListNext = PHINode::Create(ListTy->getPointerTo(), 1, "mtce.freelist.next", EntryBB);
+    FreeListNext->setDebugLoc(DebugLoc);
+
+    unsigned Idx[2] = { 1, 0 };
+    Instruction *WLNext = ExtractValueInst::Create(
+        WLItem, makeArrayRef(Idx, 2), "mtce.wlnext", EntryBB);
+
+    unsigned IncomingCount = IncomingArguments.size();
+    SmallVector<Value *, 8> Incoming;
+    Incoming.reserve(IncomingCount);
+    Idx[0] = 0;
+    for (IncomingArgument &IA : IncomingArguments) {
+      Value *V = IA.getValue();
+      if (IA.isConstant() || IA.isStatic()) {
+        Incoming.push_back(V);
+      }
+      else {
+        Instruction *EVI = ExtractValueInst::Create(
+            WLItem, makeArrayRef(Idx, 2), "mtce.op." + Twine(Idx[1]), EntryBB);
+        EVI->setDebugLoc(DebugLoc);
+        Incoming.push_back(EVI);
+        Idx[1]++;
+      }
+    }
+
+    // Clone instructions from function H back into the code sequence.
+    Function *FunctionH = CC.getFunctionH();
+    assert(FunctionH && "FunctionH not present.");
+
+    CallInst *CI = nullptr;
+    std::map<Value *, Value *> VVMap;
+    for (Instruction &I: FunctionH->getEntryBlock()) {
+      if (&I == FunctionH->getEntryBlock().getTerminator())
+        continue;
+
+      Instruction *NewI = I.clone();
+      VVMap[&I] = NewI;
+      EntryBB->getInstList().push_back(NewI);
+      CI = dyn_cast<CallInst>(NewI);
+
+      for (Use &U: NewI->operands()) {
+        Value *V = U.get();
+
+        auto VVMI = VVMap.find(V);
+
+        // For Arguments, replace the operand with the corresponding item in Incoming.
+        if (Argument *A = dyn_cast<Argument>(V)) {
+          U.set(Incoming[A->getArgNo()]);
+        }
+        // For values within the value-to-value map use VVMap to resolve the relation.
+        else if (VVMI != VVMap.end()) {
+          U.set(VVMI->second);
+        }
+        // Otherwise it must be a constant that can be left unchanged.
+        else {
+          assert(isa<Constant>(V) && "could not transform. Not a constant");
+        }
+      }
+    }
+    assert(CI && CI->getCalledFunction() == &F && "Last instruction in FunctionH was not a recursive CallInst");
+
+    // Branch into the loop.
+    BranchInst *BI = BranchInst::Create(LoopEntry, EntryBB);
+    BI->setDebugLoc(DebugLoc);
+
+    // Adjust the list head PHI node in the loop entry.
+    ListHead->addIncoming(WLNext, EntryBB);
+
+    // Adjust the free list PHI node in the loop entry.
+    FreeList->addIncoming(FreeListNext, EntryBB);
+
+    // Adjust Argument PHI nodes in the loop entry.
+    adjustPHIsFromCallInst(EntryBB, CI, ArgumentPHIs);
+
+    // Finally erase the CallInst.
+    CI->eraseFromParent();
+
+    // Remove the call instructions that got eliminated.
+    for (auto &Cand: OtherCandidates) {
+      Cand.eraseSourceInstructions();
+    }
+
+    return std::make_tuple(WLItem, FreeListNext);
+  }
+
+  /// Redirect all returning blocks into the new return block.
+  void redirectRemainingReturnBlocks(BasicBlock *PullBB, BasicBlock *NewReturnBB) {
+    for (auto &BB: F) {
+      if (&BB == NewReturnBB)
+        continue;
+
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+        BranchInst *BI = BranchInst::Create(PullBB, RI);
+        BI->setDebugLoc(RI->getDebugLoc());
+        RI->eraseFromParent();
+      }
+    }
+  }
+
+  /// Call the transformation functions above to apply the transformation.
+  void eliminateTailCalls() {
+    SmallVector<PHINode *, 8> ArgumentPHIs;
+    PHINode *ListHead;
+    PHINode *FreeList;
+    BasicBlock *LoopEntry;
+
+    std::tie(ListHead, FreeList, LoopEntry) = prepareEntry(ArgumentPHIs);
+
+    BasicBlock *CallerBB, *MtceReturnBB;
+    std::tie(CallerBB, MtceReturnBB) =
+        eliminateFirstCall(FirstCandidate.front(), LoopEntry, ArgumentPHIs);
+
+    eliminateOtherCalls(
+        OtherCandidates,
+        CallerBB,
+        ListHead,
+        FreeList);
+
+    // Create a loop entry used when executing an item from the worklist. This
+    // function will also remove the remaining static instructions.
+    PHINode *WorkListItemPHI;
+    PHINode *FreeListRecursivePHI;
+    std::tie(WorkListItemPHI, FreeListRecursivePHI) =
+        createRecursiveLoopEntry(LoopEntry, OtherCandidates,
+                                 ListHead, FreeList,
+                                 ArgumentPHIs);
+
+    BasicBlock *ReturningBB = insertReturnPath(MtceReturnBB, ListHead, FreeList, WorkListItemPHI, FreeListRecursivePHI);
+
+    redirectRemainingReturnBlocks(MtceReturnBB, ReturningBB);
+
+    // What is left is to simplify the added PHINodes. As we do not use
+    // ArgumentPHIs anymore, the list is reused for the simplification loop
+    // below.
+    ArgumentPHIs.push_back(WorkListItemPHI);
+    ArgumentPHIs.push_back(FreeListRecursivePHI);
+    ArgumentPHIs.push_back(ListHead);
+    ArgumentPHIs.push_back(FreeList);
+
+    for (PHINode *PN : ArgumentPHIs) {
+      // If the PHI node is a dynamic constant, replace it with the value it is.
+      if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "Multiple tail calls eliminated",
+                                F.getEntryBlock().front().getDebugLoc(), &F.getEntryBlock())
+             << "Calls modeled with: " << ore::NV("Width", OtherCandidatesNumOf + 1);
+    });
+  }
+};
+
+/// Wrapper class for legacy pass manager.
+struct MultiTailCallElimination : public FunctionPass {
+  static char ID;
+  MultiTailCallElimination() : FunctionPass(ID) {
+    initializeMultiTailCallEliminationPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (DisableMultiTailCallElimination)
+      return false;
+
+    if (skipFunction(F))
+      return false;
+
+    return MultiTailCallEliminationImpl(
+        F,
+        getAnalysis<AAResultsWrapperPass>().getAAResults(),
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+        getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()).run();
+  }
+};
+}
+
+char MultiTailCallElimination::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MultiTailCallElimination, DEBUG_TYPE, "Multi Tail Call Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(MultiTailCallElimination, DEBUG_TYPE, "Multi Tail Call Elimination",
+                    false, false)
+
+/// Public interface to the MultiTailCallElimination pass.
+FunctionPass *llvm::createMultiTailCallEliminationPass() {
+  return new MultiTailCallElimination();
+}
+
+/// Wrapper for the new pass manager.
+PreservedAnalyses MultiTailCallEliminationPass::run(Function &F,
+                                                    FunctionAnalysisManager &AM) {
+  if (DisableMultiTailCallElimination)
+    return PreservedAnalyses::all();
+
+  bool Changed = MultiTailCallEliminationImpl(
+      F,
+      AM.getResult<AAManager>(F),
+      AM.getResult<TargetIRAnalysis>(F),
+      AM.getResult<OptimizationRemarkEmitterAnalysis>(F)).run();
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<GlobalsAA>();
+
+  return PA;
+}
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -82,6 +82,7 @@
   initializeMemCpyOptLegacyPassPass(Registry);
   initializeMergeICmpsPass(Registry);
   initializeMergedLoadStoreMotionLegacyPassPass(Registry);
+  initializeMultiTailCallEliminationPass(Registry);
   initializeNaryReassociateLegacyPassPass(Registry);
   initializePartiallyInlineLibCallsLegacyPassPass(Registry);
   initializeReassociateLegacyPassPass(Registry);
@@ -237,6 +238,10 @@
   unwrap(PM)->add(createTailCallEliminationPass());
 }
 
+void LLVMAddMultiTailCallEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMultiTailCallEliminationPass());
+}
+
 void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createConstantPropagationPass());
 }
Index: test/Transforms/MultiTailCallElimination/MultiTailCallElimination.ll
===================================================================
--- /dev/null
+++ test/Transforms/MultiTailCallElimination/MultiTailCallElimination.ll
@@ -0,0 +1,374 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -multi-tce -S | FileCheck %s
+
+define void @simple2(i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: @simple2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MTCE_LOOP:%.*]]
+; CHECK: mtce.loop.from.worklist:
+; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY:%.*]] %mtce.mload, 1, 0
+; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY]] %mtce.mload, 0, 0
+; CHECK-NEXT: br label [[MTCE_LOOP]]
+; CHECK: mtce.loop:
+; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty* [ null, [[ENTRY:%.*]] ], [ [[MTCE_NEXTPTR_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ]
+; CHECK-NEXT: [[MTCE_FREELIST:%.*]] = phi %mtce.listty* [ null,
[[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_A1:%.*]] = phi i32 [ [[A1:%.*]], [[ENTRY]] ], [ 1, [[MTCE_LOOP_1]] ], [ [[MTCE_OP_0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[MTCE_A1]], [[A2:%.*]] +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label [[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY]], i64 1, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_FREELIST]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty*, %mtce.listty** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ALLOCATED]], i64 0, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty* [[MTCE_LISTHEAD]], %mtce.listty** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { i32 } undef, i32 [[TMP0]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_0_OP0]], { i32 }* [[MTCE_OPERANDSGEP_0]], align 4 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TM:%.*]] = icmp eq %mtce.listty* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TM]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST:%.*]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD:%.*]] = load [[MTCE_LISTTY]], %mtce.listty* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_LISTHEAD]], i64 0 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY]], %mtce.listty* [[MTCE_FREELIST_ADDED]], i64 0, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty* [[MTCE_FREELIST]], %mtce.listty** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + %0 = add i32 %a1, %a2 + call void @simple2(i32 1, i32 %a2, i32 %a3) + call void @simple2(i32 %0, i32 %a2, i32 %a3) + ret void +} + +define void @simple4_with_const(i32 %a1, i32 %a2, i32 %a3) { +; CHECK-LABEL: @simple4_with_const( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MTCE_LOOP:%.*]] +; CHECK: mtce.loop.from.worklist: +; CHECK-NEXT: [[MTCE_WORKLIST_ITEM:%.*]] = phi [[MTCE_LISTTY_0:%.*]] [ [[MTCE_UMLOAD:%.*]], [[MTCE_UNMARKED:%.*]] ], [ [[MTCE_MLOAD:%.*]], 
[[MTCE_ADD_FREELIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT2:%.*]] = phi %mtce.listty.0* [ [[MTCE_FREELIST:%.*]], [[MTCE_UNMARKED]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_ADD_FREELIST]] ] +; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY_0]] %mtce.worklist.item, 1, 0 +; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY_0]] %mtce.worklist.item, 0, 0 +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.loop: +; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty.0* [ null, [[ENTRY:%.*]] ], [ [[MTCE_TAGGED_NEXT_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST]] = phi %mtce.listty.0* [ null, [[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_NEXT2]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_A1:%.*]] = phi i32 [ [[A1:%.*]], [[ENTRY]] ], [ 1, [[MTCE_LOOP_1]] ], [ [[MTCE_OP_0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty.0* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label [[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY_0]], i64 3, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 1 +; CHECK-NEXT: [[MTCE_PTOI_1:%.*]] = ptrtoint %mtce.listty.0* [[MTCE_NEXTPTR_FOR_1]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR:%.*]] = or i64 [[MTCE_PTOI_1]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_1:%.*]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR]] to %mtce.listty.0* +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_TAGGED_NEXT_FOR_1]], %mtce.listty.0** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 1, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 1, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 2 +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_NEXTPTR_FOR_2]], %mtce.listty.0** [[MTCE_NEXTPTR_1]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ITEMPTR]], i64 2, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_FREELIST]], i64 2, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty.0*, %mtce.listty.0** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty.0* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty.0* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_2:%.*]] = getelementptr inbounds 
[[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_LISTHEAD]], %mtce.listty.0** [[MTCE_NEXTPTR_2]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { i32 } undef, i32 2, 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_0_OP0]], { i32 }* [[MTCE_OPERANDSGEP_0]], align 4 +; CHECK-NEXT: [[MTCE_ARG_1_OP0:%.*]] = insertvalue { i32 } undef, i32 3, 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 1, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_1_OP0]], { i32 }* [[MTCE_OPERANDSGEP_1]], align 4 +; CHECK-NEXT: [[MTCE_ARG_2_OP0:%.*]] = insertvalue { i32 } undef, i32 4, 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 2, i32 0 +; CHECK-NEXT: store { i32 } [[MTCE_ARG_2_OP0]], { i32 }* [[MTCE_OPERANDSGEP_2]], align 4 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: [[MTCE_PTOI_0:%.*]] = ptrtoint %mtce.listty.0* [[MTCE_NEXTPTR_FOR_0]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR1:%.*]] = or i64 [[MTCE_PTOI_0]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_0]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR1]] to %mtce.listty.0* +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_PTOI:%.*]] = ptrtoint %mtce.listty.0* [[MTCE_LISTHEAD]] to i64 +; CHECK-NEXT: [[MTCE_MARK:%.*]] = trunc i64 [[MTCE_PTOI]] to i1 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST_INTPTR:%.*]] = and i64 [[MTCE_PTOI]], -2 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST:%.*]] = inttoptr i64 [[MTCE_MASKED_WORKLIST_INTPTR]] to %mtce.listty.0* +; CHECK-NEXT: br i1 [[MTCE_MARK]], label [[MTCE_UNMARKED]], label [[MTCE_CHECK_RETURN:%.*]] +; CHECK: mtce.unmarked: +; CHECK-NEXT: [[MTCE_UMLOAD]] = load [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_MASKED_WORKLIST]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.check.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TG:%.*]] = icmp eq %mtce.listty.0* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TG]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD]] = load [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_LISTHEAD]], i64 -2 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_0]], %mtce.listty.0* [[MTCE_FREELIST_ADDED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.0* [[MTCE_FREELIST]], %mtce.listty.0** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + call void @simple4_with_const(i32 1, i32 %a2, i32 %a3) + call void @simple4_with_const(i32 2, i32 %a2, i32 %a3) + call void @simple4_with_const(i32 3, i32 %a2, i32 %a3) + call void @simple4_with_const(i32 4, i32 %a2, i32 %a3) + ret void +} + +; Traverse a full quad-tree where the load is executed first +%quadtree_ty = type { [ 4 x %quadtree_ty* ] } +define void @quadtree(%quadtree_ty* %this, i32 %a2, i32 %a3) { +; CHECK-LABEL: @quadtree( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MTCE_LOOP:%.*]] +; CHECK: mtce.loop.from.worklist: +; 
CHECK-NEXT: [[MTCE_WORKLIST_ITEM:%.*]] = phi [[MTCE_LISTTY_1:%.*]] [ [[MTCE_UMLOAD:%.*]], [[MTCE_UNMARKED:%.*]] ], [ [[MTCE_MLOAD:%.*]], [[MTCE_ADD_FREELIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT2:%.*]] = phi %mtce.listty.1* [ [[MTCE_FREELIST:%.*]], [[MTCE_UNMARKED]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_ADD_FREELIST]] ] +; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY_1]] %mtce.worklist.item, 1, 0 +; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY_1]] %mtce.worklist.item, 0, 0 +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.loop: +; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty.1* [ null, [[ENTRY:%.*]] ], [ [[MTCE_TAGGED_NEXT_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST]] = phi %mtce.listty.1* [ null, [[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_NEXT2]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_THIS:%.*]] = phi %quadtree_ty* [ [[THIS:%.*]], [[ENTRY]] ], [ [[C0:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_OP_0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[C0PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY:%.*]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[C0]] = load %quadtree_ty*, %quadtree_ty** [[C0PTR]] +; CHECK-NEXT: [[C1PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 1 +; CHECK-NEXT: [[C1:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C1PTR]] +; CHECK-NEXT: [[C2PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 2 +; CHECK-NEXT: [[C2:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C2PTR]] +; CHECK-NEXT: [[C3PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 3 +; CHECK-NEXT: [[C3:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C3PTR]] +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty.1* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label [[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY_1]], i64 3, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 1 +; CHECK-NEXT: [[MTCE_PTOI_1:%.*]] = ptrtoint %mtce.listty.1* [[MTCE_NEXTPTR_FOR_1]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR:%.*]] = or i64 [[MTCE_PTOI_1]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_1:%.*]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR]] to %mtce.listty.1* +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_TAGGED_NEXT_FOR_1]], %mtce.listty.1** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 1, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 1, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 2 +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_NEXTPTR_FOR_2]], %mtce.listty.1** [[MTCE_NEXTPTR_1]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_2:%.*]] = getelementptr inbounds 
[[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ITEMPTR]], i64 2, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_FREELIST]], i64 2, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty.1*, %mtce.listty.1** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty.1* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty.1* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_LISTHEAD]], %mtce.listty.1** [[MTCE_NEXTPTR_2]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { %quadtree_ty* } undef, %quadtree_ty* [[C1]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { %quadtree_ty* } [[MTCE_ARG_0_OP0]], { %quadtree_ty* }* [[MTCE_OPERANDSGEP_0]], align 8 +; CHECK-NEXT: [[MTCE_ARG_1_OP0:%.*]] = insertvalue { %quadtree_ty* } undef, %quadtree_ty* [[C2]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 1, i32 0 +; CHECK-NEXT: store { %quadtree_ty* } [[MTCE_ARG_1_OP0]], { %quadtree_ty* }* [[MTCE_OPERANDSGEP_1]], align 8 +; CHECK-NEXT: [[MTCE_ARG_2_OP0:%.*]] = insertvalue { %quadtree_ty* } undef, %quadtree_ty* [[C3]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 2, i32 0 +; CHECK-NEXT: store { %quadtree_ty* } [[MTCE_ARG_2_OP0]], { %quadtree_ty* }* [[MTCE_OPERANDSGEP_2]], align 8 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: [[MTCE_PTOI_0:%.*]] = ptrtoint %mtce.listty.1* [[MTCE_NEXTPTR_FOR_0]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR1:%.*]] = or i64 [[MTCE_PTOI_0]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_0]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR1]] to %mtce.listty.1* +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_PTOI:%.*]] = ptrtoint %mtce.listty.1* [[MTCE_LISTHEAD]] to i64 +; CHECK-NEXT: [[MTCE_MARK:%.*]] = trunc i64 [[MTCE_PTOI]] to i1 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST_INTPTR:%.*]] = and i64 [[MTCE_PTOI]], -2 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST:%.*]] = inttoptr i64 [[MTCE_MASKED_WORKLIST_INTPTR]] to %mtce.listty.1* +; CHECK-NEXT: br i1 [[MTCE_MARK]], label [[MTCE_UNMARKED]], label [[MTCE_CHECK_RETURN:%.*]] +; CHECK: mtce.unmarked: +; CHECK-NEXT: [[MTCE_UMLOAD]] = load [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_MASKED_WORKLIST]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.check.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TG:%.*]] = icmp eq %mtce.listty.1* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TG]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD]] = load 
[[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_LISTHEAD]], i64 -2 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_1]], %mtce.listty.1* [[MTCE_FREELIST_ADDED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.1* [[MTCE_FREELIST]], %mtce.listty.1** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + %c0ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 0 + %c0 = load %quadtree_ty*, %quadtree_ty** %c0ptr + %c1ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 1 + %c1 = load %quadtree_ty*, %quadtree_ty** %c1ptr + %c2ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 2 + %c2 = load %quadtree_ty*, %quadtree_ty** %c2ptr + %c3ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 3 + %c3 = load %quadtree_ty*, %quadtree_ty** %c3ptr + call void @quadtree(%quadtree_ty* %c0, i32 %a2, i32 %a3) + call void @quadtree(%quadtree_ty* %c1, i32 %a2, i32 %a3) + call void @quadtree(%quadtree_ty* %c2, i32 %a2, i32 %a3) + call void @quadtree(%quadtree_ty* %c3, i32 %a2, i32 %a3) + ret void +} + +; Traverse a full quad-tree where the load is executed as part of function H +define void @quadtree_H(%quadtree_ty* %this, i32 %a2, i32 %a3) { +; CHECK-LABEL: @quadtree_H( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[MTCE_LOOP:%.*]] +; CHECK: mtce.loop.from.worklist: +; CHECK-NEXT: [[MTCE_WORKLIST_ITEM:%.*]] = phi [[MTCE_LISTTY_2:%.*]] [ [[MTCE_UMLOAD:%.*]], [[MTCE_UNMARKED:%.*]] ], [ [[MTCE_MLOAD:%.*]], [[MTCE_ADD_FREELIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT2:%.*]] = phi %mtce.listty.2* [ [[MTCE_FREELIST:%.*]], [[MTCE_UNMARKED]] ], [ [[MTCE_FREELIST_ADDED:%.*]], [[MTCE_ADD_FREELIST]] ] +; CHECK-NEXT: [[MTCE_WLNEXT:%.*]] = extractvalue [[MTCE_LISTTY_2]] %mtce.worklist.item, 1, 0 +; CHECK-NEXT: [[MTCE_OP_0:%.*]] = extractvalue [[MTCE_LISTTY_2]] %mtce.worklist.item, 0, 0 +; CHECK-NEXT: [[TMP0:%.*]] = load %quadtree_ty*, %quadtree_ty** [[MTCE_OP_0]] +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.loop: +; CHECK-NEXT: [[MTCE_LISTHEAD:%.*]] = phi %mtce.listty.2* [ null, [[ENTRY:%.*]] ], [ [[MTCE_TAGGED_NEXT_FOR_0:%.*]], [[MTCE_LOOP_1:%.*]] ], [ [[MTCE_WLNEXT]], [[MTCE_LOOP_FROM_WORKLIST:%.*]] ] +; CHECK-NEXT: [[MTCE_FREELIST]] = phi %mtce.listty.2* [ null, [[ENTRY]] ], [ [[MTCE_FREELIST_NEXT:%.*]], [[MTCE_LOOP_1]] ], [ [[MTCE_FREELIST_NEXT2]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[MTCE_THIS:%.*]] = phi %quadtree_ty* [ [[THIS:%.*]], [[ENTRY]] ], [ [[C0:%.*]], [[MTCE_LOOP_1]] ], [ [[TMP0]], [[MTCE_LOOP_FROM_WORKLIST]] ] +; CHECK-NEXT: [[C0PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY:%.*]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[C0]] = load %quadtree_ty*, %quadtree_ty** [[C0PTR]] +; CHECK-NEXT: [[C1PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 1 +; CHECK-NEXT: [[C2PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 2 +; CHECK-NEXT: [[C3PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[MTCE_THIS]], i64 0, i32 0, i32 3 +; CHECK-NEXT: [[MTCE_CMP_FREELIST_EMPTY:%.*]] = icmp eq %mtce.listty.2* [[MTCE_FREELIST]], null +; CHECK-NEXT: br i1 [[MTCE_CMP_FREELIST_EMPTY]], label [[MTCE_ALLOCA:%.*]], label 
[[MTCE_ALLOCFROMFREE:%.*]] +; CHECK: mtce.alloca: +; CHECK-NEXT: [[MTCE_ITEMPTR:%.*]] = alloca [[MTCE_LISTTY_2]], i64 3, align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 0, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 1 +; CHECK-NEXT: [[MTCE_PTOI_1:%.*]] = ptrtoint %mtce.listty.2* [[MTCE_NEXTPTR_FOR_1]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR:%.*]] = or i64 [[MTCE_PTOI_1]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_1:%.*]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR]] to %mtce.listty.2* +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_TAGGED_NEXT_FOR_1]], %mtce.listty.2** [[MTCE_NEXTPTR_0]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 1, i32 1 +; CHECK-NEXT: [[MTCE_NEXTPTR_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 1, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 2 +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_NEXTPTR_FOR_2]], %mtce.listty.2** [[MTCE_NEXTPTR_1]], align 8 +; CHECK-NEXT: [[MTCE_LINKPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ITEMPTR]], i64 2, i32 1 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.allocfromfree: +; CHECK-NEXT: [[MTCE_PULLEDITEMGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_FREELIST]], i64 0 +; CHECK-NEXT: [[MTCE_FREELIST_NEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_FREELIST]], i64 2, i32 1, i32 0 +; CHECK-NEXT: [[MTCE_NEWFREELIST:%.*]] = load %mtce.listty.2*, %mtce.listty.2** [[MTCE_FREELIST_NEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_1]] +; CHECK: mtce.loop.1: +; CHECK-NEXT: [[MTCE_ALLOCATED:%.*]] = phi %mtce.listty.2* [ [[MTCE_ITEMPTR]], [[MTCE_ALLOCA]] ], [ [[MTCE_FREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_FREELIST_NEXT]] = phi %mtce.listty.2* [ [[MTCE_FREELIST]], [[MTCE_ALLOCA]] ], [ [[MTCE_NEWFREELIST]], [[MTCE_ALLOCFROMFREE]] ] +; CHECK-NEXT: [[MTCE_NEXTPTR_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_LISTHEAD]], %mtce.listty.2** [[MTCE_NEXTPTR_2]], align 8 +; CHECK-NEXT: [[MTCE_ARG_0_OP0:%.*]] = insertvalue { %quadtree_ty** } undef, %quadtree_ty** [[C1PTR]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 0, i32 0 +; CHECK-NEXT: store { %quadtree_ty** } [[MTCE_ARG_0_OP0]], { %quadtree_ty** }* [[MTCE_OPERANDSGEP_0]], align 8 +; CHECK-NEXT: [[MTCE_ARG_1_OP0:%.*]] = insertvalue { %quadtree_ty** } undef, %quadtree_ty** [[C2PTR]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_1:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 1, i32 0 +; CHECK-NEXT: store { %quadtree_ty** } [[MTCE_ARG_1_OP0]], { %quadtree_ty** }* [[MTCE_OPERANDSGEP_1]], align 8 +; CHECK-NEXT: [[MTCE_ARG_2_OP0:%.*]] = insertvalue { %quadtree_ty** } undef, %quadtree_ty** [[C3PTR]], 0 +; CHECK-NEXT: [[MTCE_OPERANDSGEP_2:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 2, i32 0 +; CHECK-NEXT: 
store { %quadtree_ty** } [[MTCE_ARG_2_OP0]], { %quadtree_ty** }* [[MTCE_OPERANDSGEP_2]], align 8 +; CHECK-NEXT: [[MTCE_NEXTPTR_FOR_0:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_ALLOCATED]], i64 0 +; CHECK-NEXT: [[MTCE_PTOI_0:%.*]] = ptrtoint %mtce.listty.2* [[MTCE_NEXTPTR_FOR_0]] to i64 +; CHECK-NEXT: [[MTCE_LISTHEAD_MSKED_INTPTR1:%.*]] = or i64 [[MTCE_PTOI_0]], 1 +; CHECK-NEXT: [[MTCE_TAGGED_NEXT_FOR_0]] = inttoptr i64 [[MTCE_LISTHEAD_MSKED_INTPTR1]] to %mtce.listty.2* +; CHECK-NEXT: br label [[MTCE_LOOP]] +; CHECK: mtce.return: +; CHECK-NEXT: [[MTCE_PTOI:%.*]] = ptrtoint %mtce.listty.2* [[MTCE_LISTHEAD]] to i64 +; CHECK-NEXT: [[MTCE_MARK:%.*]] = trunc i64 [[MTCE_PTOI]] to i1 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST_INTPTR:%.*]] = and i64 [[MTCE_PTOI]], -2 +; CHECK-NEXT: [[MTCE_MASKED_WORKLIST:%.*]] = inttoptr i64 [[MTCE_MASKED_WORKLIST_INTPTR]] to %mtce.listty.2* +; CHECK-NEXT: br i1 [[MTCE_MARK]], label [[MTCE_UNMARKED]], label [[MTCE_CHECK_RETURN:%.*]] +; CHECK: mtce.unmarked: +; CHECK-NEXT: [[MTCE_UMLOAD]] = load [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_MASKED_WORKLIST]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.check.return: +; CHECK-NEXT: [[MTCE_WORKLIST_IS_NULL_TG:%.*]] = icmp eq %mtce.listty.2* [[MTCE_LISTHEAD]], null +; CHECK-NEXT: br i1 [[MTCE_WORKLIST_IS_NULL_TG]], label [[MTCE_RETONLY:%.*]], label [[MTCE_ADD_FREELIST]] +; CHECK: mtce.add.freelist: +; CHECK-NEXT: [[MTCE_MLOAD]] = load [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_LISTHEAD]], align 8 +; CHECK-NEXT: [[MTCE_FREELIST_ADDED]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_LISTHEAD]], i64 -2 +; CHECK-NEXT: [[MTCE_FREENEXTPTR:%.*]] = getelementptr inbounds [[MTCE_LISTTY_2]], %mtce.listty.2* [[MTCE_FREELIST_ADDED]], i64 2, i32 1, i32 0 +; CHECK-NEXT: store %mtce.listty.2* [[MTCE_FREELIST]], %mtce.listty.2** [[MTCE_FREENEXTPTR]], align 8 +; CHECK-NEXT: br label [[MTCE_LOOP_FROM_WORKLIST]] +; CHECK: mtce.retonly: +; CHECK-NEXT: ret void +; +entry: + %c0ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 0 + %c0 = load %quadtree_ty*, %quadtree_ty** %c0ptr + call void @quadtree_H(%quadtree_ty* %c0, i32 %a2, i32 %a3) + %c1ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 1 + %c1 = load %quadtree_ty*, %quadtree_ty** %c1ptr + call void @quadtree_H(%quadtree_ty* %c1, i32 %a2, i32 %a3) + %c2ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 2 + %c2 = load %quadtree_ty*, %quadtree_ty** %c2ptr + call void @quadtree_H(%quadtree_ty* %c2, i32 %a2, i32 %a3) + %c3ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 3 + %c3 = load %quadtree_ty*, %quadtree_ty** %c3ptr + call void @quadtree_H(%quadtree_ty* %c3, i32 %a2, i32 %a3) + ret void +} + +; Traverse a full quad-tree where function H is different for one call +define void @quadtree_invalH(%quadtree_ty* %this, i32 %a2, i32 %a3) { +; CHECK-LABEL: @quadtree_invalH( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C0PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY:%.*]], %quadtree_ty* [[THIS:%.*]], i64 0, i32 0, i32 0 +; CHECK-NEXT: [[C0:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C0PTR]] +; CHECK-NEXT: [[C1PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[THIS]], i64 0, i32 0, i32 1 +; CHECK-NEXT: [[C1:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C1PTR]] +; CHECK-NEXT: call void @quadtree_invalH(%quadtree_ty* [[C0]], i32 [[A2:%.*]], i32 [[A3:%.*]]) +; CHECK-NEXT: call void 
@quadtree_invalH(%quadtree_ty* [[C1]], i32 [[A2]], i32 [[A3]]) +; CHECK-NEXT: [[C2PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[THIS]], i64 0, i32 0, i32 2 +; CHECK-NEXT: [[C2:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C2PTR]] +; CHECK-NEXT: call void @quadtree_invalH(%quadtree_ty* [[C2]], i32 [[A2]], i32 [[A3]]) +; CHECK-NEXT: [[C3PTR:%.*]] = getelementptr inbounds [[QUADTREE_TY]], %quadtree_ty* [[THIS]], i64 0, i32 0, i32 3 +; CHECK-NEXT: [[C3:%.*]] = load %quadtree_ty*, %quadtree_ty** [[C3PTR]] +; CHECK-NEXT: call void @quadtree_invalH(%quadtree_ty* [[C3]], i32 [[A2]], i32 [[A3]]) +; CHECK-NEXT: ret void +; +entry: + %c0ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 0 + %c0 = load %quadtree_ty*, %quadtree_ty** %c0ptr + %c1ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 1 + %c1 = load %quadtree_ty*, %quadtree_ty** %c1ptr + call void @quadtree_invalH(%quadtree_ty* %c0, i32 %a2, i32 %a3) + call void @quadtree_invalH(%quadtree_ty* %c1, i32 %a2, i32 %a3) + %c2ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 2 + %c2 = load %quadtree_ty*, %quadtree_ty** %c2ptr + call void @quadtree_invalH(%quadtree_ty* %c2, i32 %a2, i32 %a3) + %c3ptr = getelementptr inbounds %quadtree_ty, %quadtree_ty* %this, i64 0, i32 0, i32 3 + %c3 = load %quadtree_ty*, %quadtree_ty** %c3ptr + call void @quadtree_invalH(%quadtree_ty* %c3, i32 %a2, i32 %a3) + ret void +} Index: test/Transforms/MultiTailCallElimination/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/MultiTailCallElimination/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AArch64' in config.root.targets: + config.unsupported = True