Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -403,6 +403,7 @@ void initializeTargetPassConfigPass(PassRegistry&); void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); void initializeThreadSanitizerLegacyPassPass(PassRegistry&); +void initializeTreeHeightReductionPass(PassRegistry&); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeUnifyFunctionExitNodesPass(PassRegistry&); Index: include/llvm/Transforms/IPO/PassManagerBuilder.h =================================================================== --- include/llvm/Transforms/IPO/PassManagerBuilder.h +++ include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -183,6 +183,9 @@ /// Path of the sample Profile data file. std::string PGOSampleUse; + /// Path of Tree-Heght-reduction + bool EvalConcurrent; + private: /// ExtensionList - This is list of all of the extensions that are registered. std::vector> Extensions; Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -509,6 +509,12 @@ // transformations. // Pass *createWarnMissedTransformationsPass(); + +//===----------------------------------------------------------------------===// +// TreeHeightReduction - Reduce tree height. +// +Pass *createTreeHeightReductionPass(bool EnableFpTHR = false); + } // End llvm namespace #endif Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -175,6 +175,7 @@ PrepareForThinLTO = EnablePrepareForThinLTO; PerformThinLTO = EnablePerformThinLTO; DivergentTarget = false; + EvalConcurrent = false; } PassManagerBuilder::~PassManagerBuilder() { @@ -379,6 +380,9 @@ if (EnableLoopInterchange) MPM.add(createLoopInterchangePass()); // Interchange loops + if (OptLevel > 0) + MPM.add(createTreeHeightReductionPass(EvalConcurrent)); + // Unroll small loops MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -72,6 +72,7 @@ StructurizeCFG.cpp TailRecursionElimination.cpp WarnMissedTransforms.cpp + TreeHeightReduction.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -97,6 +97,7 @@ initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); + initializeTreeHeightReductionPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReducePass(Registry); Index: lib/Transforms/Scalar/TreeHeightReduction.cpp =================================================================== --- lib/Transforms/Scalar/TreeHeightReduction.cpp +++ lib/Transforms/Scalar/TreeHeightReduction.cpp @@ -0,0 +1,810 @@ +//===------------ TreeHeightReduction.cpp - Reduce tree height ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a tree-height-reduction pass. +// +// Tree height reduction is an optimization to increase instruction level +// parallelism by transforming an operation tree like the following. +// +// Before After +// +// _ N1 _ _________ N1 _________ +// | | | | +// _ N2 _ L1 ___ N2 ___ ___ N3 ___ +// | | | | | | +// _ N3 _ L2 ---> _ N4 _ _ N5 _ _ N6 _ _ N7 _ +// | | | | | | | | | | +// _ N4 _ L3 L1 L2 L3 L4 L5 L6 L7 L8 +// | | +// _ N5 _ L4 +// | | +// _ N6 _ L5 +// | | +// _ N7 _ L6 +// | | +// L7 L8 +// +//===----------------------------------------------------------------------===// +// +// An argorithm of tree height reduction is based on the paper: +// Katherine Coons, Warren Hunt, Bertrand A. Maher, Doug Burger, +// Kathryn S. McKinley. Optimal Huffman Tree-Height Reduction for +// Instruction-Level Parallelism. +// + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include +#include +#include + +using namespace llvm; + +#define THR_NAME "tree-height-reduction" +#define DEBUG_TYPE THR_NAME + +// An option not to apply tree height reduction to integer instructions. +static cl::opt +DisableIntTHR("disable-int-thr", + cl::desc("Disable tree height reduction to integer insts"), + cl::init(false), cl::ReallyHidden); + +// An option not to apply tree height reduction to floating-point instructions. +static cl::opt +EnableFpTHR("enable-fp-thr", + cl::desc("Enable tree height reduction to floating-point insts"), + cl::init(false), cl::ReallyHidden); + +namespace { + class Node { + public: + explicit Node(Value *V) + : Inst(nullptr), Opcode(0), DefOp(V), Parent(nullptr), + Left(nullptr), Right(nullptr), Latency(0), TotalCost(0) { + if (Instruction *I = dyn_cast(V)) { + Inst = I; + Opcode = I->getOpcode(); + } + } + + /// Set parent node of current node. + void setParent(Node *P) { Parent = P; } + + /// Set left node of current node. + void setLeft(Node *L) { Left = L; } + + /// Set right node of current node. + void setRight(Node *R) { Right = R; } + + /// Set latency of current node. + void setLatency(int L) { Latency = L; } + + /// Set total cost of current node. + void setTotalCost(int C) { TotalCost = C; } + + /// Get original instruction of current node. + Instruction *getOrgInst() const { + assert(Inst && "Inst should not be nullptr."); + return Inst; + } + + /// Get parent node of current node. + Node *getParent() const { return Parent; } + + /// Get left node of current node. + Node *getLeft() const { return Left; } + + /// Get right node of current node. + Node *getRight() const { return Right; } + + /// Get operator code of current node. + unsigned getOpcode() const { return Opcode; } + + /// Get latency of current node. + int getLatency() const { return Latency; } + + /// Get total cost of current node. + int getTotalCost() const { return TotalCost; } + + /// Get defined value of current node. + Value *getDefinedValue() const { + assert(DefOp && "DefOp should not be nullptr"); + return DefOp; + } + + /// Return true if current node is a node. + bool isNode() const { return Left != nullptr && Right != nullptr; } + + /// Return true if current node is a pure leaf. + bool isLeaf() const { return !isNode(); } + + /// Return true if current node is a root. + bool isRoot() const { return Parent == nullptr; } + + /// Return true if current node is considered as a leaf node. + /// Tree height reduction is applied only to nodes whose operator code is + /// same, so it is necessary to consider a node whose operation is not + /// commutative or whose operator code is different from its parent's + /// operator code as a leaf node. + bool isConsideredAsLeaf() const { + if (isLeaf()) + return true; + if (!getOrgInst()->isCommutative()) + return true; + if (!isRoot() && getOpcode() != getParent()->getOpcode()) + return true; + return false; + } + + /// Update tree's latecy under current node. + void updateTreeLatency(TargetTransformInfo *TTI); + + /// Update current node's latency. + void updateNodeLatency(TargetTransformInfo *TTI); + + enum struct UpdateLatecy { UL_Left, UL_Right }; + + // Update left or right node's latency of current node. + void updateLeftOrRightNodeLatency(TargetTransformInfo *TTI, + UpdateLatecy UL); + + private: + // An original instruction of current node. + Instruction *Inst; + // A operator code of current node. + unsigned Opcode; + + // A defined operand used at current node. + // This also includes constant value. + Value *DefOp; + + // A parent node of current node. + Node *Parent; + // A left node and a right node of current node. + Node *Left, *Right; + + // Instruction latency. + int Latency; + // Total cost of nodes under current nodes. + int TotalCost; + }; + + class TreeHeightReduction : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + + explicit TreeHeightReduction(bool EC = false) + : LoopPass(ID), EvalConcurrent(EC) { + initializeTreeHeightReductionPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + TTI = &getAnalysis().getTTI( + *L->getHeader()->getParent()); + ORE = &getAnalysis().getORE(); + + // Tree height reduction is applied only to inner-most loop. + SmallVector Worklist; + for (Loop *CurLoop: depth_first(L)) + if (CurLoop->empty()) + Worklist.push_back(CurLoop); + + bool Changed = false; + for (Loop *L: Worklist) { + auto &LoopBlocks = L->getBlocksVector(); + for (auto *BB: LoopBlocks) { + if (!DisableIntTHR) + Changed |= runOnBasicBlock(BB, TargetInstTy::INTEGER); + if (EnableFpTHR || EvalConcurrent) + Changed |= runOnBasicBlock(BB, TargetInstTy::FLOATING_POINT); + } + } + return Changed; + } + + StringRef getPassName() const override { + return "Tree Height Reduction"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + } + + private: + TargetTransformInfo *TTI; + OptimizationRemarkEmitter *ORE; + + bool EvalConcurrent; + + // 'GeneratedInsts' has instructions generated when tree height reduction + // is applied. If it is not applied, 'GeneratedInsts' has nothing. + std::set GeneratedInsts; + + enum struct TargetInstTy { INTEGER, FLOATING_POINT }; + + // This specify type of instruction included in the operation tree. + TargetInstTy CurTargetInstTy; + + // Tree height reduction is applied to each basic block which + // innermost loop contains. + bool runOnBasicBlock(BasicBlock *BB, const TargetInstTy TIT); + + // Return true if 'I' is a target instruction of tree height reduction. + bool isTHRTargetInst(Instruction *I) const; + + // Construct operation tree from value 'V'. + Node *constructTree(Value *V); + + void destructTree(Node *N); + + // Collect original instructions to be erased from BasicBlock. + void collectInstsToBeErasedFrom(Node *N, + std::vector &Insts); + + // Apply tree height reduction to 'N'. + // Return value is a tree to which tree height reduction is applied. + Node *applyTreeHeightReduction(Node *N, bool isLeft); + + // Collect leaf nodes and reused nodes under node 'N'. + void collectLeavesAndReusedNodes(Node *N, std::vector &Leaves, + std::vector &ReusedNodes); + + // Apply tree height reduction to under a node 'N', and construct + // optimized tree. + Node *constructOptimizedSubtree(std::vector &Leaves, + std::vector &ReusedNodes, Node *N); + + // Combine 2 leaf elements, create node from them and put it into 'Leaves'. + void combineLeaves(std::vector &Leaves, + Node *Op1, Node *Op2, + std::vector &ReusedNodes); + + // Create IRs from root node. + bool createIRs(Node *Root); + + // Create optimized integer or floating-point instructions from 'Node'. + // 'Ops' has a operand values. + Value *createInst(IRBuilder<> &Builder, Node *N, Value *Op1, Value *Op2); + + // Return true if 'I' is a root candidate. + bool isRootCandidate(Instruction &I) const { + if (!isTHRTargetInst(&I)) + return false; + for (unsigned i = 0; i < I.getNumOperands(); ++i) + if (isNodeCandidate(I.getOperand(i))) + return true; + return false; + } + + // Return true if 'Op' is a node candidate. + bool isNodeCandidate(Value *Op) const { + assert(Op && "Operand should not be nullptr"); + if (!Op->hasOneUse()) + return false; + if (Instruction *I = dyn_cast(Op)) + return isTHRTargetInst(I); + return false; + } + + void printTree(Node *N, const int Indent) const { + for (int i = 0; i < Indent; ++i) + errs() << " "; + N->getOrgInst()->dump(); + + for (int i = 0; i < Indent + 4; ++i) + errs() << " "; + errs() << "Latency: " << N->getLatency(); + errs() << ", TotalCost: " << N->getTotalCost() << "\n"; + + if (Node *Left = N->getLeft()) + printTree(Left, Indent + 2); + if (Node *Right = N->getRight()) + printTree(Right, Indent + 2); + } + + void printLeaves(std::vector &Leaves, bool isBefore) const { + if (isBefore) + errs() << " --- Before ---\n"; + else + errs() << " --- After ---\n"; + for (auto *Node: Leaves) + printTree(Node, 2); + } + }; +} // end anonymous namespace + +// BFS: Breadth First Search +static std::vector getNodesAndLeavesByBFS(Node *N) { + std::vector Nodes = {N}; + + for (unsigned i = 0; i < Nodes.size(); ++i) { + Node *CurNode = Nodes[i]; + if (Node *L = CurNode->getLeft()) + Nodes.push_back(L); + if (Node *R = CurNode->getRight()) + Nodes.push_back(R); + } + + return std::move(Nodes); +} + +void Node::updateTreeLatency(TargetTransformInfo *TTI) { + std::vector Nodes = getNodesAndLeavesByBFS(this); + // Update node latency by bottom-up order. + for (auto Begin = Nodes.rbegin(), End = Nodes.rend(); + Begin != End; ++Begin) { + Node *CurNode = *Begin; + CurNode->updateNodeLatency(TTI); + } +} + +void Node::updateNodeLatency(TargetTransformInfo *TTI) { + setLatency(0); + setTotalCost(0); + + if (isLeaf()) + return; + + updateLeftOrRightNodeLatency(TTI, UpdateLatecy::UL_Left); + updateLeftOrRightNodeLatency(TTI, UpdateLatecy::UL_Right); + + Instruction *Inst = getOrgInst(); + const int InstLatency = + TTI->getInstructionCost(Inst, TargetTransformInfo::TCK_Latency); + setLatency(getLatency() + InstLatency); + setTotalCost(getTotalCost() + InstLatency); +} + +void Node::updateLeftOrRightNodeLatency(TargetTransformInfo *TTI, + UpdateLatecy UL) { + const int Latency = getLatency(); + + Node *SubNode = nullptr; + switch (UL) { + case UpdateLatecy::UL_Left: { SubNode = getLeft(); break; } + case UpdateLatecy::UL_Right: { SubNode = getRight(); break; } + default: llvm_unreachable("Should not reach here."); + } + + assert(SubNode && "Left or right node should not be nullptr."); + const int SubNodeLatency = SubNode->getLatency(); + if (SubNodeLatency > Latency) + setLatency(SubNodeLatency); + setTotalCost(getTotalCost() + SubNode->getTotalCost()); +} + +static void eraseOrgInsts(std::vector &Insts) { + for (auto *I: Insts) + if (I) + I->eraseFromParent(); +} + +static bool isLegalToApply(Node *Root) { + std::vector Worklist = getNodesAndLeavesByBFS(Root); + int NumLeaves = 0; + + for (auto *N: Worklist) { + if (N->isLeaf()) + ++NumLeaves; + // We consider that it is worth applying tree height reduction + // if the number of leaves is equal to or more than 4. + if (NumLeaves >= 4) + return true; + } + + return false; +} + +bool TreeHeightReduction::runOnBasicBlock(BasicBlock *BB, + const TargetInstTy TIT) { + bool Applied = false; + CurTargetInstTy = TIT; + + // Search target instruction in basic block reversely. + for (auto Begin = BB->rbegin(); Begin != BB->rend(); ++Begin) { + Instruction &I = *Begin; + if (GeneratedInsts.count(&I) == 1) + continue; + if (!isRootCandidate(I)) + continue; + + // Construct operation tree from root instruction. + Value *V = dyn_cast(&I); + assert(V && "Defined value should not be nullptr."); + Node *OrgTree = constructTree(V); + if (!OrgTree) + continue; + + if (!isLegalToApply(OrgTree)) { + destructTree(OrgTree); + continue; + } + + std::vector Insts; + collectInstsToBeErasedFrom(OrgTree, Insts); + + // Apply tree height reduction to constructed tree 'OrgTree', + // and get tree 'ReducedTree' to which it is applied. + Node *ReducedTree = applyTreeHeightReduction(OrgTree, true); + if (!ReducedTree) { + destructTree(OrgTree); + continue; + } + + assert(ReducedTree->getOrgInst() == OrgTree->getOrgInst() && + "OrgInst of ReducedTree and OrgTree should be same."); + + // Create IRs from tree to which tree height reduction is applied. + if (createIRs(ReducedTree)) { + // The following optimization message output process must be called + // before 'eraseOrgInsts' because 'I' is erase there. + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "TreeHeightReduction", &I) + << "reduced tree height"; + }); + + --Begin; + eraseOrgInsts(Insts); + Applied = true; + } + + // 'OrgTree' and 'ReducedTree' shares memory, so it is enough to + // release memory of 'ReducedTree'. + destructTree(ReducedTree); + } + + GeneratedInsts.clear(); + return Applied; +} + +template +static bool isUsedAtCmpInst(Instruction *I) { + for (User *U: I->users()) { + Instruction *UserInst = dyn_cast(U); + if (UserInst && isa(*UserInst)) + return true; + } + return false; +} + +// Return true if 'I' is integer type and is target instruction of tree +// height reduction. +static bool isIntgerInstTHRTarget(Instruction *I) { + if (!I->getType()->isIntegerTy()) + return false; + // 'I' which is used at ICmpInst may be an induction variable. + if (isUsedAtCmpInst(I)) + return false; + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Mul: + return true; + default: + return false; + } +} + +// Return true if 'I' is floating-point type and is target instruction +// of tree height reduction. +static bool isFpInstTHRTarget(Instruction *I) { + if (!I->getType()->isFloatingPointTy()) + return false; + if (!I->isFast()) + return false; + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FMul: + return true; + default: + return false; + } +} + +bool TreeHeightReduction::isTHRTargetInst(Instruction *I) const { + switch (CurTargetInstTy) { + case TargetInstTy::INTEGER: return isIntgerInstTHRTarget(I); + case TargetInstTy::FLOATING_POINT: return isFpInstTHRTarget(I); + default: return false; + } +} + +Node *TreeHeightReduction::constructTree(Value *V) { + if (!isNodeCandidate(V)) + return new Node(V); + + Instruction *I = dyn_cast(V); + assert(I && "Instruction should not be nullptr."); + assert(I->getNumOperands() == 2 && "The number of operands should be 2."); + + Node *Parent = new Node(V); + assert(Parent && "Should not be nullptr."); + + Value *LeftOp = I->getOperand(0); + Node *Left = constructTree(LeftOp); + assert(Left && "Should not be nullptr."); + + if (Left) { + Parent->setLeft(Left); + Left->setParent(Parent); + } + + Value *RightOp = I->getOperand(1); + Node *Right = constructTree(RightOp); + assert(Right && "Should not be nullptr."); + + if (Right) { + Parent->setRight(Right); + Right->setParent(Parent); + } + + return Parent; +} + +void TreeHeightReduction::destructTree(Node *N) { + std::vector Nodes = getNodesAndLeavesByBFS(N); + for (auto *CurNode: Nodes) + delete CurNode; +} + +void TreeHeightReduction::collectInstsToBeErasedFrom( + Node *N, std::vector &Insts) { + std::vector Nodes = getNodesAndLeavesByBFS(N); + for (auto *CurNode: Nodes) + // Instruction belonging to leaf node should be saved. + if (!CurNode->isLeaf()) + Insts.push_back(CurNode->getOrgInst()); +} + +Node *TreeHeightReduction::applyTreeHeightReduction(Node *N, bool isLeft) { + // Postorder depth-first search. + if (!N->isNode()) + return N; + + if (Node *Left = N->getLeft()) + applyTreeHeightReduction(Left, true); + if (Node *Right = N->getRight()) + applyTreeHeightReduction(Right, false); + + if (!isTHRTargetInst(N->getOrgInst())) + return N; + + // Save original parent information. + Node *Parent = N->getParent(); + + std::vector Leaves; + // 'ReusedNodes' holds nodes which is reused when updating parent and child + // node's relationship at the time of reducing operation tree. + // By doing so, the amount of memory used can be reduced. + std::vector ReusedNodes; + collectLeavesAndReusedNodes(N, Leaves, ReusedNodes); + + Node *NewNode = constructOptimizedSubtree(Leaves, ReusedNodes, N); + NewNode->setParent(Parent); + if (Parent) { + if (isLeft) + Parent->setLeft(NewNode); + else + Parent->setRight(NewNode); + } + // Return value has meaning only if 'Parent' is nullptr because + // this means 'Node' is a root node. + return NewNode; +} + +void TreeHeightReduction::collectLeavesAndReusedNodes( + Node *N, std::vector &Leaves, std::vector &ReusedNodes) { + std::vector Worklist = {N}; + + // NOTE: Don't use 'getNodesAndLeavesByBFS'! Like this function, the below + // processing is the one which collects all nodes and all leaves by + // BFS, but it is a little different from this function because + // it is BFS with a condition. + for (unsigned i = 0; i < Worklist.size(); ++i) { + Node *CurNode = Worklist[i]; + if (CurNode->isConsideredAsLeaf()) { + Leaves.push_back(CurNode); + continue; + } + + ReusedNodes.push_back(CurNode); + if (Node *Left = CurNode->getLeft()) + Worklist.push_back(Left); + if (Node *Right = CurNode->getRight()) + Worklist.push_back(Right); + } +} + +Node *TreeHeightReduction::constructOptimizedSubtree( + std::vector &Leaves, std::vector &ReusedNodes, Node *N) { + auto removeNodeFromLeaves = [&] (Node *N) -> void { + for (auto Begin = Leaves.begin(); Begin != Leaves.end(); ++Begin) + if (*Begin == N) { + Leaves.erase(Begin); + --Begin; + } + }; + + while (Leaves.size() > 1) { + std::sort(Leaves.begin(), Leaves.end(), + [] (Node *LHS, Node *RHS) -> bool { + if (LHS->getLatency() != RHS->getLatency()) + return LHS->getLatency() < RHS->getLatency(); + if (LHS->getTotalCost() != RHS->getTotalCost()) + return LHS->getTotalCost() < RHS->getTotalCost(); + return false; + }); + LLVM_DEBUG(printLeaves(Leaves, true)); + + Node *Op1 = Leaves[0], *Op2 = Leaves[1]; + combineLeaves(Leaves, Op1, Op2, ReusedNodes); + removeNodeFromLeaves(Op1); + removeNodeFromLeaves(Op2); + + LLVM_DEBUG(printLeaves(Leaves, false)); + } + + Node *NewNode = Leaves.front(); + NewNode->updateTreeLatency(TTI); + return NewNode; +} + +void TreeHeightReduction::combineLeaves(std::vector &Leaves, + Node *Op1, Node *Op2, + std::vector & ReusedNodes) { + Node *N = ReusedNodes.back(); + ReusedNodes.pop_back(); + + // Update child and its parent relationship. + N->setLeft(Op1); + N->setRight(Op2); + Op1->setParent(N); + Op2->setParent(N); + + N->updateTreeLatency(TTI); + Leaves.push_back(N); +} + +static std::vector getOnlyNodes(Node *N) { + std::vector Nodes = getNodesAndLeavesByBFS(N); + + // Exclude only leaves. + for (auto Iter = Nodes.begin(); Iter != Nodes.end(); ++Iter) + if (!(*Iter)->isNode()) { + Nodes.erase(Iter); + --Iter; + } + + return std::move(Nodes); +} + +static void setLeftAndRightOperand(Value *&LeftOp, Value *&RightOp, + Node *N, std::queue &Ops) { + auto frontPopVal = [] (std::queue &Ops) -> Value * { + Value *V = Ops.front(); + Ops.pop(); + return V; + }; + + Node *L = N->getLeft(); + Node *R = N->getRight(); + if (L->isLeaf() && R->isLeaf()) { + // Both left child and right child of 'N' are leaves. + LeftOp = L->getDefinedValue(); + RightOp = R->getDefinedValue(); + } + else { + assert(!Ops.empty()); + if (L->isLeaf()) { + // Left child is a leaf and right child is a node. + LeftOp = L->getDefinedValue(); + RightOp = frontPopVal(Ops); + } + else if (R->isLeaf()) { + // Left child is a node and right child is a leaf. + LeftOp = frontPopVal(Ops); + RightOp = R->getDefinedValue(); + } + else { + // Both left child and right child of 'N' are nodes. + LeftOp = frontPopVal(Ops); + RightOp = frontPopVal(Ops); + } + } +} + +bool TreeHeightReduction::createIRs(Node *Root) { + Instruction *RootInst = Root->getOrgInst(); + IRBuilder<> Builder(RootInst); + + std::vector Nodes = getOnlyNodes(Root); + std::queue Ops; + while (!Nodes.empty()) { + Node *N = Nodes.back(); + + Value *LeftOp = nullptr, *RightOp = nullptr; + setLeftAndRightOperand(LeftOp, RightOp, N, Ops); + + Value *NewNodeValue = createInst(Builder, N, LeftOp, RightOp); + Ops.push(NewNodeValue); + GeneratedInsts.insert(dyn_cast(NewNodeValue)); + + Nodes.pop_back(); + } + + assert(Ops.size() == 1 && "The size of queue should be 1."); + Value *NewRootValue = Ops.front(); + GeneratedInsts.insert(dyn_cast(NewRootValue)); + RootInst->replaceAllUsesWith(NewRootValue); + return true; +} + +Value *TreeHeightReduction::createInst(IRBuilder<> &Builder, Node *N, + Value *Op1, Value *Op2) { + Value *V; + switch (N->getOpcode()) { + case Instruction::Add: { + V = Builder.CreateAdd(Op1, Op2, Twine("thr.add")); + break; + } + case Instruction::Mul: { + V = Builder.CreateMul(Op1, Op2, Twine("thr.mul")); + break; + } + case Instruction::FAdd: { + V = Builder.CreateFAdd(Op1, Op2, Twine("thr.fadd")); + break; + } + case Instruction::FMul: { + V = Builder.CreateFMul(Op1, Op2, Twine("thr.fmul")); + break; + } + default: + return nullptr; + } + + // Take over the original instruction IR flags. + Instruction *NewInst = dyn_cast(V); + if (Value *OrgDef = N->getDefinedValue()) + NewInst->copyIRFlags(OrgDef, true); + + return V; +} + +char TreeHeightReduction::ID = 0; + +INITIALIZE_PASS_BEGIN(TreeHeightReduction, "tree-height-reduction", + "tree height reduction", false, false) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(TreeHeightReduction, "tree-height-reduction", + "tree height reduction", false, false) + +Pass *llvm::createTreeHeightReductionPass(bool EvalConcurrent) { + return new TreeHeightReduction(EvalConcurrent); +} Index: test/Other/opt-O2-pipeline.ll =================================================================== --- test/Other/opt-O2-pipeline.ll +++ test/Other/opt-O2-pipeline.ll @@ -122,6 +122,20 @@ ; CHECK-NEXT: Induction Variable Simplification ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Tree Height Reduction +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Canonicalize natural loops +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis Index: test/Other/opt-O3-pipeline.ll =================================================================== --- test/Other/opt-O3-pipeline.ll +++ test/Other/opt-O3-pipeline.ll @@ -127,6 +127,20 @@ ; CHECK-NEXT: Induction Variable Simplification ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Tree Height Reduction +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Canonicalize natural loops +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis Index: test/Other/opt-Os-pipeline.ll =================================================================== --- test/Other/opt-Os-pipeline.ll +++ test/Other/opt-Os-pipeline.ll @@ -109,6 +109,20 @@ ; CHECK-NEXT: Induction Variable Simplification ; CHECK-NEXT: Recognize loop idioms ; CHECK-NEXT: Delete dead loops +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Tree Height Reduction +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Canonicalize natural loops +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager ; CHECK-NEXT: Unroll loops ; CHECK-NEXT: MergedLoadStoreMotion ; CHECK-NEXT: Phi Values Analysis Index: test/Other/pass-pipelines.ll =================================================================== --- test/Other/pass-pipelines.ll +++ test/Other/pass-pipelines.ll @@ -60,6 +60,13 @@ ; CHECK-O2: Combine redundant instructions ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Pass Manager +; CHECK-O2-NEXT: Induction Variable Simplification +; CHECK-O2-NOT: Manager +; CHECK-O2: Loop Pass Manager +; CHECK-O2-NEXT: Tree Height Reduction +; CHECK-O2-NOT: Manager +; CHECK-O2: Loop Pass Manager +; CHECK-O2-NEXT: Unroll loops ; CHECK-O2-NOT: Manager ; FIXME: It isn't clear that we need yet another loop pass pipeline ; and run of LICM here. Index: test/Transforms/TreeHeightReduction/AArch64/fp16-add-with-constant.ll =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/fp16-add-with-constant.ll +++ test/Transforms/TreeHeightReduction/AArch64/fp16-add-with-constant.ll @@ -0,0 +1,55 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_half_with_constant( +; CHECK: %[[V0:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast half {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd fast half {{.*}}, 0xH4900 +; CHECK-NEXT: %[[V3:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd fast half %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd fast half %[[V3]], %[[V4]] +; CHECK-NEXT: fadd fast half %[[V5]], %[[V6]] +define void @add_half_with_constant(half* noalias %A, half* noalias %B, half* noalias %C, half* noalias %D, half* noalias %E, half* noalias %F, half* noalias %G, half* noalias %H, half* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, half* %B, i64 %indvars.iv + %0 = load half, half* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, half* %C, i64 %indvars.iv + %1 = load half, half* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds half, half* %D, i64 %indvars.iv + %2 = load half, half* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds half, half* %E, i64 %indvars.iv + %3 = load half, half* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds half, half* %F, i64 %indvars.iv + %4 = load half, half* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds half, half* %G, i64 %indvars.iv + %5 = load half, half* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds half, half* %H, i64 %indvars.iv + %6 = load half, half* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds half, half* %I, i64 %indvars.iv + %7 = load half, half* %arrayidx.8, align 4 + %8 = fadd fast half %0, 0xH4900 + %9 = fadd fast half %8, %1 + %10 = fadd fast half %9, %2 + %11 = fadd fast half %10, %3 + %12 = fadd fast half %11, %4 + %13 = fadd fast half %12, %5 + %14 = fadd fast half %13, %6 + %15 = fadd fast half %14, %7 + %arrayidx.9 = getelementptr inbounds half, half* %A, i64 %indvars.iv + store half %15, half* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/fp16-add.c =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/fp16-add.c +++ test/Transforms/TreeHeightReduction/AArch64/fp16-add.c @@ -0,0 +1,53 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_half( +; CHECK: %[[V0:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast half %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd fast half %[[V2]], %[[V3]] +; CHECK-NEXT: fadd fast half %[[V4]], %[[V5]] +define void @add_half(half* noalias %A, half* noalias %B, half* noalias %C, half* noalias %D, half* noalias %E, half* noalias %F, half* noalias %G, half* noalias %H, half* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, half* %B, i64 %indvars.iv + %0 = load half, half* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, half* %C, i64 %indvars.iv + %1 = load half, half* %arrayidx.2, align 4 + %2 = fadd fast half %1, %0 + %arrayidx.3 = getelementptr inbounds half, half* %D, i64 %indvars.iv + %3 = load half, half* %arrayidx.3, align 4 + %4 = fadd fast half %2, %3 + %arrayidx.4 = getelementptr inbounds half, half* %E, i64 %indvars.iv + %5 = load half, half* %arrayidx.4, align 4 + %6 = fadd fast half %4, %5 + %arrayidx.5 = getelementptr inbounds half, half* %F, i64 %indvars.iv + %7 = load half, half* %arrayidx.5, align 4 + %8 = fadd fast half %6, %7 + %arrayidx.6 = getelementptr inbounds half, half* %G, i64 %indvars.iv + %9 = load half, half* %arrayidx.6, align 4 + %10 = fadd fast half %8, %9 + %arrayidx.7 = getelementptr inbounds half, half* %H, i64 %indvars.iv + %11 = load half, half* %arrayidx.7, align 4 + %12 = fadd fast half %10, %11 + %arrayidx.8 = getelementptr inbounds half, half* %I, i64 %indvars.iv + %13 = load half, half* %arrayidx.8, align 4 + %14 = fadd fast half %12, %13 + %arrayidx.9 = getelementptr inbounds half, half* %A, i64 %indvars.iv + store half %14, half* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/fp16-mult.c =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/fp16-mult.c +++ test/Transforms/TreeHeightReduction/AArch64/fp16-mult.c @@ -0,0 +1,53 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_half( +; CHECK: %[[V0:.*]] = fmul fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul fast half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul fast half %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul fast half %[[V2]], %[[V3]] +; CHECK-NEXT: fmul fast half %[[V4]], %[[V5]] +define void @add_half(half* noalias %A, half* noalias %B, half* noalias %C, half* noalias %D, half* noalias %E, half* noalias %F, half* noalias %G, half* noalias %H, half* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, half* %B, i64 %indvars.iv + %0 = load half, half* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, half* %C, i64 %indvars.iv + %1 = load half, half* %arrayidx.2, align 4 + %2 = fmul fast half %1, %0 + %arrayidx.3 = getelementptr inbounds half, half* %D, i64 %indvars.iv + %3 = load half, half* %arrayidx.3, align 4 + %4 = fmul fast half %2, %3 + %arrayidx.4 = getelementptr inbounds half, half* %E, i64 %indvars.iv + %5 = load half, half* %arrayidx.4, align 4 + %6 = fmul fast half %4, %5 + %arrayidx.5 = getelementptr inbounds half, half* %F, i64 %indvars.iv + %7 = load half, half* %arrayidx.5, align 4 + %8 = fmul fast half %6, %7 + %arrayidx.6 = getelementptr inbounds half, half* %G, i64 %indvars.iv + %9 = load half, half* %arrayidx.6, align 4 + %10 = fmul fast half %8, %9 + %arrayidx.7 = getelementptr inbounds half, half* %H, i64 %indvars.iv + %11 = load half, half* %arrayidx.7, align 4 + %12 = fmul fast half %10, %11 + %arrayidx.8 = getelementptr inbounds half, half* %I, i64 %indvars.iv + %13 = load half, half* %arrayidx.8, align 4 + %14 = fmul fast half %12, %13 + %arrayidx.9 = getelementptr inbounds half, half* %A, i64 %indvars.iv + store half %14, half* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/fp16-sub.ll =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/fp16-sub.ll +++ test/Transforms/TreeHeightReduction/AArch64/fp16-sub.ll @@ -0,0 +1,105 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @sub_half( +; CHECK: %[[V0:.*]] = fsub fast half {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast half %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast half %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast half %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast half %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast half %[[V4]], {{.*}} +; CHECK: fsub fast half %[[V5]], {{.*}} +define void @sub_half(half* noalias %A, half* noalias %B, half* noalias %C, half* noalias %D, half* noalias %E, half* noalias %F, half* noalias %G, half* noalias %H, half* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, half* %B, i64 %indvars.iv + %0 = load half, half* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, half* %C, i64 %indvars.iv + %1 = load half, half* %arrayidx.2, align 4 + %2 = fsub fast half %0, %1 + %arrayidx.3 = getelementptr inbounds half, half* %D, i64 %indvars.iv + %3 = load half, half* %arrayidx.3, align 4 + %4 = fsub fast half %2, %3 + %arrayidx.4 = getelementptr inbounds half, half* %E, i64 %indvars.iv + %5 = load half, half* %arrayidx.4, align 4 + %6 = fsub fast half %4, %5 + %arrayidx.5 = getelementptr inbounds half, half* %F, i64 %indvars.iv + %7 = load half, half* %arrayidx.5, align 4 + %8 = fsub fast half %6, %7 + %arrayidx.6 = getelementptr inbounds half, half* %G, i64 %indvars.iv + %9 = load half, half* %arrayidx.6, align 4 + %10 = fsub fast half %8, %9 + %arrayidx.7 = getelementptr inbounds half, half* %H, i64 %indvars.iv + %11 = load half, half* %arrayidx.7, align 4 + %12 = fsub fast half %10, %11 + %arrayidx.8 = getelementptr inbounds half, half* %I, i64 %indvars.iv + %13 = load half, half* %arrayidx.8, align 4 + %14 = fsub fast half %12, %13 + %arrayidx.9 = getelementptr inbounds half, half* %A, i64 %indvars.iv + store half %14, half* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub fast double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast double %[[V4]], {{.*}} +; CHECK: fsub fast double %[[V5]], {{.*}} +define void @sub_double(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %2 = fsub fast double %0, %1 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %3 = load double, double* %arrayidx.3, align 4 + %4 = fsub fast double %2, %3 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %5 = load double, double* %arrayidx.4, align 4 + %6 = fsub fast double %4, %5 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %7 = load double, double* %arrayidx.5, align 4 + %8 = fsub fast double %6, %7 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %9 = load double, double* %arrayidx.6, align 4 + %10 = fsub fast double %8, %9 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %11 = load double, double* %arrayidx.7, align 4 + %12 = fsub fast double %10, %11 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %13 = load double, double* %arrayidx.8, align 4 + %14 = fsub fast double %12, %13 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %14, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/long-double-add-with-constant.ll =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/long-double-add-with-constant.ll +++ test/Transforms/TreeHeightReduction/AArch64/long-double-add-with-constant.ll @@ -0,0 +1,55 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_fp128_with_constant( +; CHECK: %[[V0:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast fp128 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd fast fp128 {{.*}}, 0xL00000000000000004002400000000000 +; CHECK-NEXT: %[[V3:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd fast fp128 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd fast fp128 %[[V3]], %[[V4]] +; CHECK-NEXT: fadd fast fp128 %[[V5]], %[[V6]] +define void @add_fp128_with_constant(fp128* noalias %A, fp128* noalias %B, fp128* noalias %C, fp128* noalias %D, fp128* noalias %E, fp128* noalias %F, fp128* noalias %G, fp128* noalias %H, fp128* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, fp128* %B, i64 %indvars.iv + %0 = load fp128, fp128* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, fp128* %C, i64 %indvars.iv + %1 = load fp128, fp128* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds fp128, fp128* %D, i64 %indvars.iv + %2 = load fp128, fp128* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds fp128, fp128* %E, i64 %indvars.iv + %3 = load fp128, fp128* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds fp128, fp128* %F, i64 %indvars.iv + %4 = load fp128, fp128* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds fp128, fp128* %G, i64 %indvars.iv + %5 = load fp128, fp128* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds fp128, fp128* %H, i64 %indvars.iv + %6 = load fp128, fp128* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds fp128, fp128* %I, i64 %indvars.iv + %7 = load fp128, fp128* %arrayidx.8, align 4 + %8 = fadd fast fp128 %0, 0xL00000000000000004002400000000000 + %9 = fadd fast fp128 %8, %1 + %10 = fadd fast fp128 %9, %2 + %11 = fadd fast fp128 %10, %3 + %12 = fadd fast fp128 %11, %4 + %13 = fadd fast fp128 %12, %5 + %14 = fadd fast fp128 %13, %6 + %15 = fadd fast fp128 %14, %7 + %arrayidx.9 = getelementptr inbounds fp128, fp128* %A, i64 %indvars.iv + store fp128 %15, fp128* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/long-double-add.ll =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/long-double-add.ll +++ test/Transforms/TreeHeightReduction/AArch64/long-double-add.ll @@ -0,0 +1,53 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_fp128( +; CHECK: %[[V0:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast fp128 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd fast fp128 %[[V2]], %[[V3]] +; CHECK-NEXT: fadd fast fp128 %[[V4]], %[[V5]] +define void @add_fp128(fp128* noalias %A, fp128* noalias %B, fp128* noalias %C, fp128* noalias %D, fp128* noalias %E, fp128* noalias %F, fp128* noalias %G, fp128* noalias %H, fp128* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, fp128* %B, i64 %indvars.iv + %0 = load fp128, fp128* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, fp128* %C, i64 %indvars.iv + %1 = load fp128, fp128* %arrayidx.2, align 4 + %2 = fadd fast fp128 %1, %0 + %arrayidx.3 = getelementptr inbounds fp128, fp128* %D, i64 %indvars.iv + %3 = load fp128, fp128* %arrayidx.3, align 4 + %4 = fadd fast fp128 %2, %3 + %arrayidx.4 = getelementptr inbounds fp128, fp128* %E, i64 %indvars.iv + %5 = load fp128, fp128* %arrayidx.4, align 4 + %6 = fadd fast fp128 %4, %5 + %arrayidx.5 = getelementptr inbounds fp128, fp128* %F, i64 %indvars.iv + %7 = load fp128, fp128* %arrayidx.5, align 4 + %8 = fadd fast fp128 %6, %7 + %arrayidx.6 = getelementptr inbounds fp128, fp128* %G, i64 %indvars.iv + %9 = load fp128, fp128* %arrayidx.6, align 4 + %10 = fadd fast fp128 %8, %9 + %arrayidx.7 = getelementptr inbounds fp128, fp128* %H, i64 %indvars.iv + %11 = load fp128, fp128* %arrayidx.7, align 4 + %12 = fadd fast fp128 %10, %11 + %arrayidx.8 = getelementptr inbounds fp128, fp128* %I, i64 %indvars.iv + %13 = load fp128, fp128* %arrayidx.8, align 4 + %14 = fadd fast fp128 %12, %13 + %arrayidx.9 = getelementptr inbounds fp128, fp128* %A, i64 %indvars.iv + store fp128 %14, fp128* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/long-double-mult.ll =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/long-double-mult.ll +++ test/Transforms/TreeHeightReduction/AArch64/long-double-mult.ll @@ -0,0 +1,53 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_fp128( +; CHECK: %[[V0:.*]] = fmul fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul fast fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul fast fp128 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul fast fp128 %[[V2]], %[[V3]] +; CHECK-NEXT: fmul fast fp128 %[[V4]], %[[V5]] +define void @add_fp128(fp128* noalias %A, fp128* noalias %B, fp128* noalias %C, fp128* noalias %D, fp128* noalias %E, fp128* noalias %F, fp128* noalias %G, fp128* noalias %H, fp128* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, fp128* %B, i64 %indvars.iv + %0 = load fp128, fp128* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, fp128* %C, i64 %indvars.iv + %1 = load fp128, fp128* %arrayidx.2, align 4 + %2 = fmul fast fp128 %1, %0 + %arrayidx.3 = getelementptr inbounds fp128, fp128* %D, i64 %indvars.iv + %3 = load fp128, fp128* %arrayidx.3, align 4 + %4 = fmul fast fp128 %2, %3 + %arrayidx.4 = getelementptr inbounds fp128, fp128* %E, i64 %indvars.iv + %5 = load fp128, fp128* %arrayidx.4, align 4 + %6 = fmul fast fp128 %4, %5 + %arrayidx.5 = getelementptr inbounds fp128, fp128* %F, i64 %indvars.iv + %7 = load fp128, fp128* %arrayidx.5, align 4 + %8 = fmul fast fp128 %6, %7 + %arrayidx.6 = getelementptr inbounds fp128, fp128* %G, i64 %indvars.iv + %9 = load fp128, fp128* %arrayidx.6, align 4 + %10 = fmul fast fp128 %8, %9 + %arrayidx.7 = getelementptr inbounds fp128, fp128* %H, i64 %indvars.iv + %11 = load fp128, fp128* %arrayidx.7, align 4 + %12 = fmul fast fp128 %10, %11 + %arrayidx.8 = getelementptr inbounds fp128, fp128* %I, i64 %indvars.iv + %13 = load fp128, fp128* %arrayidx.8, align 4 + %14 = fmul fast fp128 %12, %13 + %arrayidx.9 = getelementptr inbounds fp128, fp128* %A, i64 %indvars.iv + store fp128 %14, fp128* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/AArch64/long-double-sub.ll =================================================================== --- test/Transforms/TreeHeightReduction/AArch64/long-double-sub.ll +++ test/Transforms/TreeHeightReduction/AArch64/long-double-sub.ll @@ -0,0 +1,105 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @sub_fp128( +; CHECK: %[[V0:.*]] = fsub fast fp128 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast fp128 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast fp128 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast fp128 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast fp128 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast fp128 %[[V4]], {{.*}} +; CHECK: fsub fast fp128 %[[V5]], {{.*}} +define void @sub_fp128(fp128* noalias %A, fp128* noalias %B, fp128* noalias %C, fp128* noalias %D, fp128* noalias %E, fp128* noalias %F, fp128* noalias %G, fp128* noalias %H, fp128* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, fp128* %B, i64 %indvars.iv + %0 = load fp128, fp128* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, fp128* %C, i64 %indvars.iv + %1 = load fp128, fp128* %arrayidx.2, align 4 + %2 = fsub fast fp128 %0, %1 + %arrayidx.3 = getelementptr inbounds fp128, fp128* %D, i64 %indvars.iv + %3 = load fp128, fp128* %arrayidx.3, align 4 + %4 = fsub fast fp128 %2, %3 + %arrayidx.4 = getelementptr inbounds fp128, fp128* %E, i64 %indvars.iv + %5 = load fp128, fp128* %arrayidx.4, align 4 + %6 = fsub fast fp128 %4, %5 + %arrayidx.5 = getelementptr inbounds fp128, fp128* %F, i64 %indvars.iv + %7 = load fp128, fp128* %arrayidx.5, align 4 + %8 = fsub fast fp128 %6, %7 + %arrayidx.6 = getelementptr inbounds fp128, fp128* %G, i64 %indvars.iv + %9 = load fp128, fp128* %arrayidx.6, align 4 + %10 = fsub fast fp128 %8, %9 + %arrayidx.7 = getelementptr inbounds fp128, fp128* %H, i64 %indvars.iv + %11 = load fp128, fp128* %arrayidx.7, align 4 + %12 = fsub fast fp128 %10, %11 + %arrayidx.8 = getelementptr inbounds fp128, fp128* %I, i64 %indvars.iv + %13 = load fp128, fp128* %arrayidx.8, align 4 + %14 = fsub fast fp128 %12, %13 + %arrayidx.9 = getelementptr inbounds fp128, fp128* %A, i64 %indvars.iv + store fp128 %14, fp128* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub fast double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast double %[[V4]], {{.*}} +; CHECK: fsub fast double %[[V5]], {{.*}} +define void @sub_double(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %2 = fsub fast double %0, %1 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %3 = load double, double* %arrayidx.3, align 4 + %4 = fsub fast double %2, %3 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %5 = load double, double* %arrayidx.4, align 4 + %6 = fsub fast double %4, %5 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %7 = load double, double* %arrayidx.5, align 4 + %8 = fsub fast double %6, %7 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %9 = load double, double* %arrayidx.6, align 4 + %10 = fsub fast double %8, %9 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %11 = load double, double* %arrayidx.7, align 4 + %12 = fsub fast double %10, %11 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %13 = load double, double* %arrayidx.8, align 4 + %14 = fsub fast double %12, %13 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %14, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/X86/long-double-add-with-constant.ll =================================================================== --- test/Transforms/TreeHeightReduction/X86/long-double-add-with-constant.ll +++ test/Transforms/TreeHeightReduction/X86/long-double-add-with-constant.ll @@ -0,0 +1,55 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_x86_fp80_with_constant( +; CHECK: %[[V0:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast x86_fp80 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd fast x86_fp80 {{.*}}, 0xK4002A000000000000000 +; CHECK-NEXT: %[[V3:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd fast x86_fp80 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd fast x86_fp80 %[[V3]], %[[V4]] +; CHECK-NEXT: fadd fast x86_fp80 %[[V5]], %[[V6]] +define void @add_x86_fp80_with_constant(x86_fp80* noalias %A, x86_fp80* noalias %B, x86_fp80* noalias %C, x86_fp80* noalias %D, x86_fp80* noalias %E, x86_fp80* noalias %F, x86_fp80* noalias %G, x86_fp80* noalias %H, x86_fp80* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds x86_fp80, x86_fp80* %B, i64 %indvars.iv + %0 = load x86_fp80, x86_fp80* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds x86_fp80, x86_fp80* %C, i64 %indvars.iv + %1 = load x86_fp80, x86_fp80* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds x86_fp80, x86_fp80* %D, i64 %indvars.iv + %2 = load x86_fp80, x86_fp80* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds x86_fp80, x86_fp80* %E, i64 %indvars.iv + %3 = load x86_fp80, x86_fp80* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds x86_fp80, x86_fp80* %F, i64 %indvars.iv + %4 = load x86_fp80, x86_fp80* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds x86_fp80, x86_fp80* %G, i64 %indvars.iv + %5 = load x86_fp80, x86_fp80* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds x86_fp80, x86_fp80* %H, i64 %indvars.iv + %6 = load x86_fp80, x86_fp80* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds x86_fp80, x86_fp80* %I, i64 %indvars.iv + %7 = load x86_fp80, x86_fp80* %arrayidx.8, align 4 + %8 = fadd fast x86_fp80 %0, 0xK4002A000000000000000 + %9 = fadd fast x86_fp80 %8, %1 + %10 = fadd fast x86_fp80 %9, %2 + %11 = fadd fast x86_fp80 %10, %3 + %12 = fadd fast x86_fp80 %11, %4 + %13 = fadd fast x86_fp80 %12, %5 + %14 = fadd fast x86_fp80 %13, %6 + %15 = fadd fast x86_fp80 %14, %7 + %arrayidx.9 = getelementptr inbounds x86_fp80, x86_fp80* %A, i64 %indvars.iv + store x86_fp80 %15, x86_fp80* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/X86/long-double-add.ll =================================================================== --- test/Transforms/TreeHeightReduction/X86/long-double-add.ll +++ test/Transforms/TreeHeightReduction/X86/long-double-add.ll @@ -0,0 +1,53 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_x86_fp80( +; CHECK: %[[V0:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast x86_fp80 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd fast x86_fp80 %[[V2]], %[[V3]] +; CHECK-NEXT: fadd fast x86_fp80 %[[V4]], %[[V5]] +define void @add_x86_fp80(x86_fp80* noalias %A, x86_fp80* noalias %B, x86_fp80* noalias %C, x86_fp80* noalias %D, x86_fp80* noalias %E, x86_fp80* noalias %F, x86_fp80* noalias %G, x86_fp80* noalias %H, x86_fp80* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds x86_fp80, x86_fp80* %B, i64 %indvars.iv + %0 = load x86_fp80, x86_fp80* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds x86_fp80, x86_fp80* %C, i64 %indvars.iv + %1 = load x86_fp80, x86_fp80* %arrayidx.2, align 4 + %2 = fadd fast x86_fp80 %1, %0 + %arrayidx.3 = getelementptr inbounds x86_fp80, x86_fp80* %D, i64 %indvars.iv + %3 = load x86_fp80, x86_fp80* %arrayidx.3, align 4 + %4 = fadd fast x86_fp80 %2, %3 + %arrayidx.4 = getelementptr inbounds x86_fp80, x86_fp80* %E, i64 %indvars.iv + %5 = load x86_fp80, x86_fp80* %arrayidx.4, align 4 + %6 = fadd fast x86_fp80 %4, %5 + %arrayidx.5 = getelementptr inbounds x86_fp80, x86_fp80* %F, i64 %indvars.iv + %7 = load x86_fp80, x86_fp80* %arrayidx.5, align 4 + %8 = fadd fast x86_fp80 %6, %7 + %arrayidx.6 = getelementptr inbounds x86_fp80, x86_fp80* %G, i64 %indvars.iv + %9 = load x86_fp80, x86_fp80* %arrayidx.6, align 4 + %10 = fadd fast x86_fp80 %8, %9 + %arrayidx.7 = getelementptr inbounds x86_fp80, x86_fp80* %H, i64 %indvars.iv + %11 = load x86_fp80, x86_fp80* %arrayidx.7, align 4 + %12 = fadd fast x86_fp80 %10, %11 + %arrayidx.8 = getelementptr inbounds x86_fp80, x86_fp80* %I, i64 %indvars.iv + %13 = load x86_fp80, x86_fp80* %arrayidx.8, align 4 + %14 = fadd fast x86_fp80 %12, %13 + %arrayidx.9 = getelementptr inbounds x86_fp80, x86_fp80* %A, i64 %indvars.iv + store x86_fp80 %14, x86_fp80* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/X86/long-double-mult.c =================================================================== --- test/Transforms/TreeHeightReduction/X86/long-double-mult.c +++ test/Transforms/TreeHeightReduction/X86/long-double-mult.c @@ -0,0 +1,53 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_x86_fp80( +; CHECK: %[[V0:.*]] = fmul fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul fast x86_fp80 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul fast x86_fp80 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul fast x86_fp80 %[[V2]], %[[V3]] +; CHECK-NEXT: fmul fast x86_fp80 %[[V4]], %[[V5]] +define void @add_x86_fp80(x86_fp80* noalias %A, x86_fp80* noalias %B, x86_fp80* noalias %C, x86_fp80* noalias %D, x86_fp80* noalias %E, x86_fp80* noalias %F, x86_fp80* noalias %G, x86_fp80* noalias %H, x86_fp80* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds x86_fp80, x86_fp80* %B, i64 %indvars.iv + %0 = load x86_fp80, x86_fp80* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds x86_fp80, x86_fp80* %C, i64 %indvars.iv + %1 = load x86_fp80, x86_fp80* %arrayidx.2, align 4 + %2 = fmul fast x86_fp80 %1, %0 + %arrayidx.3 = getelementptr inbounds x86_fp80, x86_fp80* %D, i64 %indvars.iv + %3 = load x86_fp80, x86_fp80* %arrayidx.3, align 4 + %4 = fmul fast x86_fp80 %2, %3 + %arrayidx.4 = getelementptr inbounds x86_fp80, x86_fp80* %E, i64 %indvars.iv + %5 = load x86_fp80, x86_fp80* %arrayidx.4, align 4 + %6 = fmul fast x86_fp80 %4, %5 + %arrayidx.5 = getelementptr inbounds x86_fp80, x86_fp80* %F, i64 %indvars.iv + %7 = load x86_fp80, x86_fp80* %arrayidx.5, align 4 + %8 = fmul fast x86_fp80 %6, %7 + %arrayidx.6 = getelementptr inbounds x86_fp80, x86_fp80* %G, i64 %indvars.iv + %9 = load x86_fp80, x86_fp80* %arrayidx.6, align 4 + %10 = fmul fast x86_fp80 %8, %9 + %arrayidx.7 = getelementptr inbounds x86_fp80, x86_fp80* %H, i64 %indvars.iv + %11 = load x86_fp80, x86_fp80* %arrayidx.7, align 4 + %12 = fmul fast x86_fp80 %10, %11 + %arrayidx.8 = getelementptr inbounds x86_fp80, x86_fp80* %I, i64 %indvars.iv + %13 = load x86_fp80, x86_fp80* %arrayidx.8, align 4 + %14 = fmul fast x86_fp80 %12, %13 + %arrayidx.9 = getelementptr inbounds x86_fp80, x86_fp80* %A, i64 %indvars.iv + store x86_fp80 %14, x86_fp80* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/X86/long-double-sub-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/X86/long-double-sub-only.ll +++ test/Transforms/TreeHeightReduction/X86/long-double-sub-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @sub_x86_fp80( +; CHECK: %[[V0:.*]] = fsub fast x86_fp80 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast x86_fp80 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast x86_fp80 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast x86_fp80 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast x86_fp80 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast x86_fp80 %[[V4]], {{.*}} +; CHECK: fsub fast x86_fp80 %[[V5]], {{.*}} +define void @sub_x86_fp80(x86_fp80* noalias %A, x86_fp80* noalias %B, x86_fp80* noalias %C, x86_fp80* noalias %D, x86_fp80* noalias %E, x86_fp80* noalias %F, x86_fp80* noalias %G, x86_fp80* noalias %H, x86_fp80* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds x86_fp80, x86_fp80* %B, i64 %indvars.iv + %0 = load x86_fp80, x86_fp80* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds x86_fp80, x86_fp80* %C, i64 %indvars.iv + %1 = load x86_fp80, x86_fp80* %arrayidx.2, align 4 + %2 = fsub fast x86_fp80 %0, %1 + %arrayidx.3 = getelementptr inbounds x86_fp80, x86_fp80* %D, i64 %indvars.iv + %3 = load x86_fp80, x86_fp80* %arrayidx.3, align 4 + %4 = fsub fast x86_fp80 %2, %3 + %arrayidx.4 = getelementptr inbounds x86_fp80, x86_fp80* %E, i64 %indvars.iv + %5 = load x86_fp80, x86_fp80* %arrayidx.4, align 4 + %6 = fsub fast x86_fp80 %4, %5 + %arrayidx.5 = getelementptr inbounds x86_fp80, x86_fp80* %F, i64 %indvars.iv + %7 = load x86_fp80, x86_fp80* %arrayidx.5, align 4 + %8 = fsub fast x86_fp80 %6, %7 + %arrayidx.6 = getelementptr inbounds x86_fp80, x86_fp80* %G, i64 %indvars.iv + %9 = load x86_fp80, x86_fp80* %arrayidx.6, align 4 + %10 = fsub fast x86_fp80 %8, %9 + %arrayidx.7 = getelementptr inbounds x86_fp80, x86_fp80* %H, i64 %indvars.iv + %11 = load x86_fp80, x86_fp80* %arrayidx.7, align 4 + %12 = fsub fast x86_fp80 %10, %11 + %arrayidx.8 = getelementptr inbounds x86_fp80, x86_fp80* %I, i64 %indvars.iv + %13 = load x86_fp80, x86_fp80* %arrayidx.8, align 4 + %14 = fsub fast x86_fp80 %12, %13 + %arrayidx.9 = getelementptr inbounds x86_fp80, x86_fp80* %A, i64 %indvars.iv + store x86_fp80 %14, x86_fp80* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub fast double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast double %[[V4]], {{.*}} +; CHECK: fsub fast double %[[V5]], {{.*}} +define void @sub_double(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %2 = fsub fast double %0, %1 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %3 = load double, double* %arrayidx.3, align 4 + %4 = fsub fast double %2, %3 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %5 = load double, double* %arrayidx.4, align 4 + %6 = fsub fast double %4, %5 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %7 = load double, double* %arrayidx.5, align 4 + %8 = fsub fast double %6, %7 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %9 = load double, double* %arrayidx.6, align 4 + %10 = fsub fast double %8, %9 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %11 = load double, double* %arrayidx.7, align 4 + %12 = fsub fast double %10, %11 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %13 = load double, double* %arrayidx.8, align 4 + %14 = fsub fast double %12, %13 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %14, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/floating-point-add-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/floating-point-add-only.ll +++ test/Transforms/TreeHeightReduction/floating-point-add-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_float( +; CHECK: %[[V0:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast float %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd fast float %[[V2]], %[[V3]] +; CHECK-NEXT: fadd fast float %[[V4]], %[[V5]] +define void @add_float(float* noalias %A, float* noalias %B, float* noalias %C, float* noalias %D, float* noalias %E, float* noalias %F, float* noalias %G, float* noalias %H, float* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, float* %C, i64 %indvars.iv + %1 = load float, float* %arrayidx.2, align 4 + %2 = fadd fast float %1, %0 + %arrayidx.3 = getelementptr inbounds float, float* %D, i64 %indvars.iv + %3 = load float, float* %arrayidx.3, align 4 + %4 = fadd fast float %2, %3 + %arrayidx.4 = getelementptr inbounds float, float* %E, i64 %indvars.iv + %5 = load float, float* %arrayidx.4, align 4 + %6 = fadd fast float %4, %5 + %arrayidx.5 = getelementptr inbounds float, float* %F, i64 %indvars.iv + %7 = load float, float* %arrayidx.5, align 4 + %8 = fadd fast float %6, %7 + %arrayidx.6 = getelementptr inbounds float, float* %G, i64 %indvars.iv + %9 = load float, float* %arrayidx.6, align 4 + %10 = fadd fast float %8, %9 + %arrayidx.7 = getelementptr inbounds float, float* %H, i64 %indvars.iv + %11 = load float, float* %arrayidx.7, align 4 + %12 = fadd fast float %10, %11 + %arrayidx.8 = getelementptr inbounds float, float* %I, i64 %indvars.iv + %13 = load float, float* %arrayidx.8, align 4 + %14 = fadd fast float %12, %13 + %arrayidx.9 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %14, float* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_double( +; CHECK: %[[V0:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast double %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd fast double %[[V2]], %[[V3]] +; CHECK-NEXT: fadd fast double %[[V4]], %[[V5]] +define void @add_double(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %2 = fadd fast double %1, %0 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %3 = load double, double* %arrayidx.3, align 4 + %4 = fadd fast double %2, %3 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %5 = load double, double* %arrayidx.4, align 4 + %6 = fadd fast double %4, %5 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %7 = load double, double* %arrayidx.5, align 4 + %8 = fadd fast double %6, %7 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %9 = load double, double* %arrayidx.6, align 4 + %10 = fadd fast double %8, %9 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %11 = load double, double* %arrayidx.7, align 4 + %12 = fadd fast double %10, %11 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %13 = load double, double* %arrayidx.8, align 4 + %14 = fadd fast double %12, %13 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %14, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/floating-point-add-with-constant.ll =================================================================== --- test/Transforms/TreeHeightReduction/floating-point-add-with-constant.ll +++ test/Transforms/TreeHeightReduction/floating-point-add-with-constant.ll @@ -0,0 +1,109 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_float_with_constant( +; CHECK: %[[V0:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast float {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd fast float {{.*}}, 1.000000e+01 +; CHECK-NEXT: %[[V3:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd fast float %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd fast float %[[V3]], %[[V4]] +; CHECK-NEXT: fadd fast float %[[V5]], %[[V6]] +define void @add_float_with_constant(float* noalias %A, float* noalias %B, float* noalias %C, float* noalias %D, float* noalias %E, float* noalias %F, float* noalias %G, float* noalias %H, float* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, float* %C, i64 %indvars.iv + %1 = load float, float* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, float* %D, i64 %indvars.iv + %2 = load float, float* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds float, float* %E, i64 %indvars.iv + %3 = load float, float* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds float, float* %F, i64 %indvars.iv + %4 = load float, float* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds float, float* %G, i64 %indvars.iv + %5 = load float, float* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds float, float* %H, i64 %indvars.iv + %6 = load float, float* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds float, float* %I, i64 %indvars.iv + %7 = load float, float* %arrayidx.8, align 4 + %8 = fadd fast float %0, 1.000000e+01 + %9 = fadd fast float %8, %1 + %10 = fadd fast float %9, %2 + %11 = fadd fast float %10, %3 + %12 = fadd fast float %11, %4 + %13 = fadd fast float %12, %5 + %14 = fadd fast float %13, %6 + %15 = fadd fast float %14, %7 + %arrayidx.9 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %15, float* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_double_with_constant( +; CHECK: %[[V0:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd fast double {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd fast double {{.*}}, 1.000000e+01 +; CHECK-NEXT: %[[V3:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd fast double %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd fast double %[[V3]], %[[V4]] +; CHECK-NEXT: fadd fast double %[[V5]], %[[V6]] +define void @add_double_with_constant(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %2 = load double, double* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %3 = load double, double* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %4 = load double, double* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %5 = load double, double* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %6 = load double, double* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %7 = load double, double* %arrayidx.8, align 4 + %8 = fadd fast double %0, 1.000000e+01 + %9 = fadd fast double %8, %1 + %10 = fadd fast double %9, %2 + %11 = fadd fast double %10, %3 + %12 = fadd fast double %11, %4 + %13 = fadd fast double %12, %5 + %14 = fadd fast double %13, %6 + %15 = fadd fast double %14, %7 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %15, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/floating-point-mult-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/floating-point-mult-only.ll +++ test/Transforms/TreeHeightReduction/floating-point-mult-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @add_float( +; CHECK: %[[V0:.*]] = fmul fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul fast float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul fast float %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul fast float %[[V2]], %[[V3]] +; CHECK-NEXT: fmul fast float %[[V4]], %[[V5]] +define void @add_float(float* noalias %A, float* noalias %B, float* noalias %C, float* noalias %D, float* noalias %E, float* noalias %F, float* noalias %G, float* noalias %H, float* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, float* %C, i64 %indvars.iv + %1 = load float, float* %arrayidx.2, align 4 + %2 = fmul fast float %1, %0 + %arrayidx.3 = getelementptr inbounds float, float* %D, i64 %indvars.iv + %3 = load float, float* %arrayidx.3, align 4 + %4 = fmul fast float %2, %3 + %arrayidx.4 = getelementptr inbounds float, float* %E, i64 %indvars.iv + %5 = load float, float* %arrayidx.4, align 4 + %6 = fmul fast float %4, %5 + %arrayidx.5 = getelementptr inbounds float, float* %F, i64 %indvars.iv + %7 = load float, float* %arrayidx.5, align 4 + %8 = fmul fast float %6, %7 + %arrayidx.6 = getelementptr inbounds float, float* %G, i64 %indvars.iv + %9 = load float, float* %arrayidx.6, align 4 + %10 = fmul fast float %8, %9 + %arrayidx.7 = getelementptr inbounds float, float* %H, i64 %indvars.iv + %11 = load float, float* %arrayidx.7, align 4 + %12 = fmul fast float %10, %11 + %arrayidx.8 = getelementptr inbounds float, float* %I, i64 %indvars.iv + %13 = load float, float* %arrayidx.8, align 4 + %14 = fmul fast float %12, %13 + %arrayidx.9 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %14, float* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_double( +; CHECK: %[[V0:.*]] = fmul fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul fast double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul fast double %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul fast double %[[V2]], %[[V3]] +; CHECK-NEXT: fmul fast double %[[V4]], %[[V5]] +define void @add_double(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %2 = fmul fast double %1, %0 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %3 = load double, double* %arrayidx.3, align 4 + %4 = fmul fast double %2, %3 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %5 = load double, double* %arrayidx.4, align 4 + %6 = fmul fast double %4, %5 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %7 = load double, double* %arrayidx.5, align 4 + %8 = fmul fast double %6, %7 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %9 = load double, double* %arrayidx.6, align 4 + %10 = fmul fast double %8, %9 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %11 = load double, double* %arrayidx.7, align 4 + %12 = fmul fast double %10, %11 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %13 = load double, double* %arrayidx.8, align 4 + %14 = fmul fast double %12, %13 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %14, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/floating-point-sub-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/floating-point-sub-only.ll +++ test/Transforms/TreeHeightReduction/floating-point-sub-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -O1 -enable-fp-thr -S < %s | FileCheck %s + +; CHECK-LABEL: @sub_float( +; CHECK: %[[V0:.*]] = fsub fast float {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast float %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast float %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast float %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast float %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast float %[[V4]], {{.*}} +; CHECK: fsub fast float %[[V5]], {{.*}} +define void @sub_float(float* noalias %A, float* noalias %B, float* noalias %C, float* noalias %D, float* noalias %E, float* noalias %F, float* noalias %G, float* noalias %H, float* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, float* %C, i64 %indvars.iv + %1 = load float, float* %arrayidx.2, align 4 + %2 = fsub fast float %0, %1 + %arrayidx.3 = getelementptr inbounds float, float* %D, i64 %indvars.iv + %3 = load float, float* %arrayidx.3, align 4 + %4 = fsub fast float %2, %3 + %arrayidx.4 = getelementptr inbounds float, float* %E, i64 %indvars.iv + %5 = load float, float* %arrayidx.4, align 4 + %6 = fsub fast float %4, %5 + %arrayidx.5 = getelementptr inbounds float, float* %F, i64 %indvars.iv + %7 = load float, float* %arrayidx.5, align 4 + %8 = fsub fast float %6, %7 + %arrayidx.6 = getelementptr inbounds float, float* %G, i64 %indvars.iv + %9 = load float, float* %arrayidx.6, align 4 + %10 = fsub fast float %8, %9 + %arrayidx.7 = getelementptr inbounds float, float* %H, i64 %indvars.iv + %11 = load float, float* %arrayidx.7, align 4 + %12 = fsub fast float %10, %11 + %arrayidx.8 = getelementptr inbounds float, float* %I, i64 %indvars.iv + %13 = load float, float* %arrayidx.8, align 4 + %14 = fsub fast float %12, %13 + %arrayidx.9 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %14, float* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub fast double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub fast double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub fast double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub fast double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub fast double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub fast double %[[V4]], {{.*}} +; CHECK: fsub fast double %[[V5]], {{.*}} +define void @sub_double(double* noalias %A, double* noalias %B, double* noalias %C, double* noalias %D, double* noalias %E, double* noalias %F, double* noalias %G, double* noalias %H, double* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %B, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %C, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %2 = fsub fast double %0, %1 + %arrayidx.3 = getelementptr inbounds double, double* %D, i64 %indvars.iv + %3 = load double, double* %arrayidx.3, align 4 + %4 = fsub fast double %2, %3 + %arrayidx.4 = getelementptr inbounds double, double* %E, i64 %indvars.iv + %5 = load double, double* %arrayidx.4, align 4 + %6 = fsub fast double %4, %5 + %arrayidx.5 = getelementptr inbounds double, double* %F, i64 %indvars.iv + %7 = load double, double* %arrayidx.5, align 4 + %8 = fsub fast double %6, %7 + %arrayidx.6 = getelementptr inbounds double, double* %G, i64 %indvars.iv + %9 = load double, double* %arrayidx.6, align 4 + %10 = fsub fast double %8, %9 + %arrayidx.7 = getelementptr inbounds double, double* %H, i64 %indvars.iv + %11 = load double, double* %arrayidx.7, align 4 + %12 = fsub fast double %10, %11 + %arrayidx.8 = getelementptr inbounds double, double* %I, i64 %indvars.iv + %13 = load double, double* %arrayidx.8, align 4 + %14 = fsub fast double %12, %13 + %arrayidx.9 = getelementptr inbounds double, double* %A, i64 %indvars.iv + store double %14, double* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/integer-add-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/integer-add-only.ll +++ test/Transforms/TreeHeightReduction/integer-add-only.ll @@ -0,0 +1,209 @@ +; RUN: opt -O1 -S < %s | FileCheck %s + +; CHECK-LABEL: @add_i8( +; CHECK: %[[V0:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i8 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i8 %[[V2]], %[[V3]] +; CHECK-NEXT: add i8 %[[V4]], %[[V5]] +define void @add_i8(i8* noalias %A, i8* noalias %B, i8* noalias %C, i8* noalias %D, i8* noalias %E, i8* noalias %F, i8* noalias %G, i8* noalias %H, i8* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv + %0 = load i8, i8* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv + %1 = load i8, i8* %arrayidx.2, align 1 + %2 = add i8 %1, %0 + %arrayidx.3 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv + %3 = load i8, i8* %arrayidx.3, align 1 + %4 = add i8 %2, %3 + %arrayidx.4 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv + %5 = load i8, i8* %arrayidx.4, align 1 + %6 = add i8 %4, %5 + %arrayidx.5 = getelementptr inbounds i8, i8* %F, i64 %indvars.iv + %7 = load i8, i8* %arrayidx.5, align 1 + %8 = add i8 %6, %7 + %arrayidx.6 = getelementptr inbounds i8, i8* %G, i64 %indvars.iv + %9 = load i8, i8* %arrayidx.6, align 1 + %10 = add i8 %8, %9 + %arrayidx.7 = getelementptr inbounds i8, i8* %H, i64 %indvars.iv + %11 = load i8, i8* %arrayidx.7, align 1 + %12 = add i8 %10, %11 + %arrayidx.8 = getelementptr inbounds i8, i8* %I, i64 %indvars.iv + %13 = load i8, i8* %arrayidx.8, align 1 + %14 = add i8 %12, %13 + %arrayidx.9 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %14, i8* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_i16( +; CHECK: %[[V0:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i16 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i16 %[[V2]], %[[V3]] +; CHECK-NEXT: add i16 %[[V4]], %[[V5]] +define void @add_i16(i16* noalias %A, i16* noalias %B, i16* noalias %C, i16* noalias %D, i16* noalias %E, i16* noalias %F, i16* noalias %G, i16* noalias %H, i16* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, i16* %B, i64 %indvars.iv + %0 = load i16, i16* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i16, i16* %C, i64 %indvars.iv + %1 = load i16, i16* %arrayidx.2, align 1 + %2 = add i16 %1, %0 + %arrayidx.3 = getelementptr inbounds i16, i16* %D, i64 %indvars.iv + %3 = load i16, i16* %arrayidx.3, align 1 + %4 = add i16 %2, %3 + %arrayidx.4 = getelementptr inbounds i16, i16* %E, i64 %indvars.iv + %5 = load i16, i16* %arrayidx.4, align 1 + %6 = add i16 %4, %5 + %arrayidx.5 = getelementptr inbounds i16, i16* %F, i64 %indvars.iv + %7 = load i16, i16* %arrayidx.5, align 1 + %8 = add i16 %6, %7 + %arrayidx.6 = getelementptr inbounds i16, i16* %G, i64 %indvars.iv + %9 = load i16, i16* %arrayidx.6, align 1 + %10 = add i16 %8, %9 + %arrayidx.7 = getelementptr inbounds i16, i16* %H, i64 %indvars.iv + %11 = load i16, i16* %arrayidx.7, align 1 + %12 = add i16 %10, %11 + %arrayidx.8 = getelementptr inbounds i16, i16* %I, i64 %indvars.iv + %13 = load i16, i16* %arrayidx.8, align 1 + %14 = add i16 %12, %13 + %arrayidx.9 = getelementptr inbounds i16, i16* %A, i64 %indvars.iv + store i16 %14, i16* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_i32( +; CHECK: %[[V0:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i32 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i32 %[[V2]], %[[V3]] +; CHECK-NEXT: add i32 %[[V4]], %[[V5]] +define void @add_i32(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32* noalias %E, i32* noalias %F, i32* noalias %G, i32* noalias %H, i32* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx.2, align 1 + %2 = add i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, i32* %D, i64 %indvars.iv + %3 = load i32, i32* %arrayidx.3, align 1 + %4 = add i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, i32* %E, i64 %indvars.iv + %5 = load i32, i32* %arrayidx.4, align 1 + %6 = add i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, i32* %F, i64 %indvars.iv + %7 = load i32, i32* %arrayidx.5, align 1 + %8 = add i32 %6, %7 + %arrayidx.6 = getelementptr inbounds i32, i32* %G, i64 %indvars.iv + %9 = load i32, i32* %arrayidx.6, align 1 + %10 = add i32 %8, %9 + %arrayidx.7 = getelementptr inbounds i32, i32* %H, i64 %indvars.iv + %11 = load i32, i32* %arrayidx.7, align 1 + %12 = add i32 %10, %11 + %arrayidx.8 = getelementptr inbounds i32, i32* %I, i64 %indvars.iv + %13 = load i32, i32* %arrayidx.8, align 1 + %14 = add i32 %12, %13 + %arrayidx.9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %14, i32* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_i64( +; CHECK: %[[V0:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i64 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i64 %[[V2]], %[[V3]] +; CHECK-NEXT: add i64 %[[V4]], %[[V5]] +define void @add_i64(i64* noalias %A, i64* noalias %B, i64* noalias %C, i64* noalias %D, i64* noalias %E, i64* noalias %F, i64* noalias %G, i64* noalias %H, i64* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, i64* %B, i64 %indvars.iv + %0 = load i64, i64* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i64, i64* %C, i64 %indvars.iv + %1 = load i64, i64* %arrayidx.2, align 1 + %2 = add i64 %1, %0 + %arrayidx.3 = getelementptr inbounds i64, i64* %D, i64 %indvars.iv + %3 = load i64, i64* %arrayidx.3, align 1 + %4 = add i64 %2, %3 + %arrayidx.4 = getelementptr inbounds i64, i64* %E, i64 %indvars.iv + %5 = load i64, i64* %arrayidx.4, align 1 + %6 = add i64 %4, %5 + %arrayidx.5 = getelementptr inbounds i64, i64* %F, i64 %indvars.iv + %7 = load i64, i64* %arrayidx.5, align 1 + %8 = add i64 %6, %7 + %arrayidx.6 = getelementptr inbounds i64, i64* %G, i64 %indvars.iv + %9 = load i64, i64* %arrayidx.6, align 1 + %10 = add i64 %8, %9 + %arrayidx.7 = getelementptr inbounds i64, i64* %H, i64 %indvars.iv + %11 = load i64, i64* %arrayidx.7, align 1 + %12 = add i64 %10, %11 + %arrayidx.8 = getelementptr inbounds i64, i64* %I, i64 %indvars.iv + %13 = load i64, i64* %arrayidx.8, align 1 + %14 = add i64 %12, %13 + %arrayidx.9 = getelementptr inbounds i64, i64* %A, i64 %indvars.iv + store i64 %14, i64* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/integer-add-with-constant.ll =================================================================== --- test/Transforms/TreeHeightReduction/integer-add-with-constant.ll +++ test/Transforms/TreeHeightReduction/integer-add-with-constant.ll @@ -0,0 +1,217 @@ +; RUN: opt -O1 -S < %s | FileCheck %s + +; CHECK-LABEL: @add_with_constant_i8( +; CHECK: %[[V0:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i8 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i8 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i8 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i8 %[[V3]], %[[V4]] +; CHECK-NEXT: add i8 %[[V5]], %[[V6]] +define void @add_with_constant_i8(i8* noalias %A, i8* noalias %B, i8* noalias %C, i8* noalias %D, i8* noalias %E, i8* noalias %F, i8* noalias %G, i8* noalias %H, i8* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv + %0 = load i8, i8* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv + %1 = load i8, i8* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv + %2 = load i8, i8* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv + %3 = load i8, i8* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i8, i8* %F, i64 %indvars.iv + %4 = load i8, i8* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i8, i8* %G, i64 %indvars.iv + %5 = load i8, i8* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i8, i8* %H, i64 %indvars.iv + %6 = load i8, i8* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i8, i8* %I, i64 %indvars.iv + %7 = load i8, i8* %arrayidx.8, align 4 + %8 = add i8 %0, 10 + %9 = add i8 %8, %1 + %10 = add i8 %9, %2 + %11 = add i8 %10, %3 + %12 = add i8 %11, %4 + %13 = add i8 %12, %5 + %14 = add i8 %13, %6 + %15 = add i8 %14, %7 + %arrayidx.9 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %15, i8* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_with_constant_i16( +; CHECK: %[[V0:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i16 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i16 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i16 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i16 %[[V3]], %[[V4]] +; CHECK-NEXT: add i16 %[[V5]], %[[V6]] +define void @add_with_constant_i16(i16* noalias %A, i16* noalias %B, i16* noalias %C, i16* noalias %D, i16* noalias %E, i16* noalias %F, i16* noalias %G, i16* noalias %H, i16* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, i16* %B, i64 %indvars.iv + %0 = load i16, i16* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i16, i16* %C, i64 %indvars.iv + %1 = load i16, i16* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i16, i16* %D, i64 %indvars.iv + %2 = load i16, i16* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i16, i16* %E, i64 %indvars.iv + %3 = load i16, i16* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i16, i16* %F, i64 %indvars.iv + %4 = load i16, i16* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i16, i16* %G, i64 %indvars.iv + %5 = load i16, i16* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i16, i16* %H, i64 %indvars.iv + %6 = load i16, i16* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i16, i16* %I, i64 %indvars.iv + %7 = load i16, i16* %arrayidx.8, align 4 + %8 = add i16 %0, 10 + %9 = add i16 %8, %1 + %10 = add i16 %9, %2 + %11 = add i16 %10, %3 + %12 = add i16 %11, %4 + %13 = add i16 %12, %5 + %14 = add i16 %13, %6 + %15 = add i16 %14, %7 + %arrayidx.9 = getelementptr inbounds i16, i16* %A, i64 %indvars.iv + store i16 %15, i16* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_with_constant_i32( +; CHECK: %[[V0:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i32 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i32 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i32 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i32 %[[V3]], %[[V4]] +; CHECK-NEXT: add i32 %[[V5]], %[[V6]] +define void @add_with_constant_i32(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32* noalias %E, i32* noalias %F, i32* noalias %G, i32* noalias %H, i32* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i32, i32* %D, i64 %indvars.iv + %2 = load i32, i32* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i32, i32* %E, i64 %indvars.iv + %3 = load i32, i32* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i32, i32* %F, i64 %indvars.iv + %4 = load i32, i32* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i32, i32* %G, i64 %indvars.iv + %5 = load i32, i32* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i32, i32* %H, i64 %indvars.iv + %6 = load i32, i32* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i32, i32* %I, i64 %indvars.iv + %7 = load i32, i32* %arrayidx.8, align 4 + %8 = add i32 %0, 10 + %9 = add i32 %8, %1 + %10 = add i32 %9, %2 + %11 = add i32 %10, %3 + %12 = add i32 %11, %4 + %13 = add i32 %12, %5 + %14 = add i32 %13, %6 + %15 = add i32 %14, %7 + %arrayidx.9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %15, i32* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_with_constant_i64( +; CHECK: %[[V0:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i64 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i64 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i64 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i64 %[[V3]], %[[V4]] +; CHECK-NEXT: add i64 %[[V5]], %[[V6]] +define void @add_with_constant_i64(i64* noalias %A, i64* noalias %B, i64* noalias %C, i64* noalias %D, i64* noalias %E, i64* noalias %F, i64* noalias %G, i64* noalias %H, i64* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, i64* %B, i64 %indvars.iv + %0 = load i64, i64* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i64, i64* %C, i64 %indvars.iv + %1 = load i64, i64* %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i64, i64* %D, i64 %indvars.iv + %2 = load i64, i64* %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i64, i64* %E, i64 %indvars.iv + %3 = load i64, i64* %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i64, i64* %F, i64 %indvars.iv + %4 = load i64, i64* %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i64, i64* %G, i64 %indvars.iv + %5 = load i64, i64* %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i64, i64* %H, i64 %indvars.iv + %6 = load i64, i64* %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i64, i64* %I, i64 %indvars.iv + %7 = load i64, i64* %arrayidx.8, align 4 + %8 = add i64 %0, 10 + %9 = add i64 %8, %1 + %10 = add i64 %9, %2 + %11 = add i64 %10, %3 + %12 = add i64 %11, %4 + %13 = add i64 %12, %5 + %14 = add i64 %13, %6 + %15 = add i64 %14, %7 + %arrayidx.9 = getelementptr inbounds i64, i64* %A, i64 %indvars.iv + store i64 %15, i64* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/integer-mult-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/integer-mult-only.ll +++ test/Transforms/TreeHeightReduction/integer-mult-only.ll @@ -0,0 +1,209 @@ +; RUN: opt -O1 -S < %s | FileCheck %s + +; CHECK-LABEL: @mul_i8( +; CHECK: %[[V0:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i8 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i8 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i8 %[[V4]], %[[V5]] +define void @mul_i8(i8* noalias %A, i8* noalias %B, i8* noalias %C, i8* noalias %D, i8* noalias %E, i8* noalias %F, i8* noalias %G, i8* noalias %H, i8* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv + %0 = load i8, i8* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv + %1 = load i8, i8* %arrayidx.2, align 1 + %2 = mul i8 %1, %0 + %arrayidx.3 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv + %3 = load i8, i8* %arrayidx.3, align 1 + %4 = mul i8 %2, %3 + %arrayidx.4 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv + %5 = load i8, i8* %arrayidx.4, align 1 + %6 = mul i8 %4, %5 + %arrayidx.5 = getelementptr inbounds i8, i8* %F, i64 %indvars.iv + %7 = load i8, i8* %arrayidx.5, align 1 + %8 = mul i8 %6, %7 + %arrayidx.6 = getelementptr inbounds i8, i8* %G, i64 %indvars.iv + %9 = load i8, i8* %arrayidx.6, align 1 + %10 = mul i8 %8, %9 + %arrayidx.7 = getelementptr inbounds i8, i8* %H, i64 %indvars.iv + %11 = load i8, i8* %arrayidx.7, align 1 + %12 = mul i8 %10, %11 + %arrayidx.8 = getelementptr inbounds i8, i8* %I, i64 %indvars.iv + %13 = load i8, i8* %arrayidx.8, align 1 + %14 = mul i8 %12, %13 + %arrayidx.9 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %14, i8* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @mul_i16( +; CHECK: %[[V0:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i16 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i16 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i16 %[[V4]], %[[V5]] +define void @mul_i16(i16* noalias %A, i16* noalias %B, i16* noalias %C, i16* noalias %D, i16* noalias %E, i16* noalias %F, i16* noalias %G, i16* noalias %H, i16* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, i16* %B, i64 %indvars.iv + %0 = load i16, i16* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i16, i16* %C, i64 %indvars.iv + %1 = load i16, i16* %arrayidx.2, align 1 + %2 = mul i16 %1, %0 + %arrayidx.3 = getelementptr inbounds i16, i16* %D, i64 %indvars.iv + %3 = load i16, i16* %arrayidx.3, align 1 + %4 = mul i16 %2, %3 + %arrayidx.4 = getelementptr inbounds i16, i16* %E, i64 %indvars.iv + %5 = load i16, i16* %arrayidx.4, align 1 + %6 = mul i16 %4, %5 + %arrayidx.5 = getelementptr inbounds i16, i16* %F, i64 %indvars.iv + %7 = load i16, i16* %arrayidx.5, align 1 + %8 = mul i16 %6, %7 + %arrayidx.6 = getelementptr inbounds i16, i16* %G, i64 %indvars.iv + %9 = load i16, i16* %arrayidx.6, align 1 + %10 = mul i16 %8, %9 + %arrayidx.7 = getelementptr inbounds i16, i16* %H, i64 %indvars.iv + %11 = load i16, i16* %arrayidx.7, align 1 + %12 = mul i16 %10, %11 + %arrayidx.8 = getelementptr inbounds i16, i16* %I, i64 %indvars.iv + %13 = load i16, i16* %arrayidx.8, align 1 + %14 = mul i16 %12, %13 + %arrayidx.9 = getelementptr inbounds i16, i16* %A, i64 %indvars.iv + store i16 %14, i16* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @mul_i32( +; CHECK: %[[V0:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i32 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i32 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i32 %[[V4]], %[[V5]] +define void @mul_i32(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32* noalias %E, i32* noalias %F, i32* noalias %G, i32* noalias %H, i32* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx.2, align 1 + %2 = mul i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, i32* %D, i64 %indvars.iv + %3 = load i32, i32* %arrayidx.3, align 1 + %4 = mul i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, i32* %E, i64 %indvars.iv + %5 = load i32, i32* %arrayidx.4, align 1 + %6 = mul i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, i32* %F, i64 %indvars.iv + %7 = load i32, i32* %arrayidx.5, align 1 + %8 = mul i32 %6, %7 + %arrayidx.6 = getelementptr inbounds i32, i32* %G, i64 %indvars.iv + %9 = load i32, i32* %arrayidx.6, align 1 + %10 = mul i32 %8, %9 + %arrayidx.7 = getelementptr inbounds i32, i32* %H, i64 %indvars.iv + %11 = load i32, i32* %arrayidx.7, align 1 + %12 = mul i32 %10, %11 + %arrayidx.8 = getelementptr inbounds i32, i32* %I, i64 %indvars.iv + %13 = load i32, i32* %arrayidx.8, align 1 + %14 = mul i32 %12, %13 + %arrayidx.9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %14, i32* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @mul_i64( +; CHECK: %[[V0:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i64 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i64 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i64 %[[V4]], %[[V5]] +define void @mul_i64(i64* noalias %A, i64* noalias %B, i64* noalias %C, i64* noalias %D, i64* noalias %E, i64* noalias %F, i64* noalias %G, i64* noalias %H, i64* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, i64* %B, i64 %indvars.iv + %0 = load i64, i64* %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i64, i64* %C, i64 %indvars.iv + %1 = load i64, i64* %arrayidx.2, align 1 + %2 = mul i64 %1, %0 + %arrayidx.3 = getelementptr inbounds i64, i64* %D, i64 %indvars.iv + %3 = load i64, i64* %arrayidx.3, align 1 + %4 = mul i64 %2, %3 + %arrayidx.4 = getelementptr inbounds i64, i64* %E, i64 %indvars.iv + %5 = load i64, i64* %arrayidx.4, align 1 + %6 = mul i64 %4, %5 + %arrayidx.5 = getelementptr inbounds i64, i64* %F, i64 %indvars.iv + %7 = load i64, i64* %arrayidx.5, align 1 + %8 = mul i64 %6, %7 + %arrayidx.6 = getelementptr inbounds i64, i64* %G, i64 %indvars.iv + %9 = load i64, i64* %arrayidx.6, align 1 + %10 = mul i64 %8, %9 + %arrayidx.7 = getelementptr inbounds i64, i64* %H, i64 %indvars.iv + %11 = load i64, i64* %arrayidx.7, align 1 + %12 = mul i64 %10, %11 + %arrayidx.8 = getelementptr inbounds i64, i64* %I, i64 %indvars.iv + %13 = load i64, i64* %arrayidx.8, align 1 + %14 = mul i64 %12, %13 + %arrayidx.9 = getelementptr inbounds i64, i64* %A, i64 %indvars.iv + store i64 %14, i64* %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/integer-sub-only.ll =================================================================== --- test/Transforms/TreeHeightReduction/integer-sub-only.ll +++ test/Transforms/TreeHeightReduction/integer-sub-only.ll @@ -0,0 +1,209 @@ +; RUN: opt -O1 -S < %s | FileCheck %s + +; CHECK-LABEL: @sub_i8( +; CHECK: %[[V0:.*]] = sub i8 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i8 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i8 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i8 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i8 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i8 %[[V4]], {{.*}} +; CHECK: sub i8 %[[V5]], {{.*}} +define void @sub_i8(i8* noalias %A, i8* noalias %B, i8* noalias %C, i8* noalias %D, i8* noalias %E, i8* noalias %F, i8* noalias %G, i8* noalias %H, i8* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv + %0 = load i8, i8* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv + %1 = load i8, i8* %arrayidx.2, align 4 + %2 = sub i8 %0, %1 + %arrayidx.3 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv + %3 = load i8, i8* %arrayidx.3, align 4 + %4 = sub i8 %2, %3 + %arrayidx.4 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv + %5 = load i8, i8* %arrayidx.4, align 4 + %6 = sub i8 %4, %5 + %arrayidx.5 = getelementptr inbounds i8, i8* %F, i64 %indvars.iv + %7 = load i8, i8* %arrayidx.5, align 4 + %8 = sub i8 %6, %7 + %arrayidx.6 = getelementptr inbounds i8, i8* %G, i64 %indvars.iv + %9 = load i8, i8* %arrayidx.6, align 4 + %10 = sub i8 %8, %9 + %arrayidx.7 = getelementptr inbounds i8, i8* %H, i64 %indvars.iv + %11 = load i8, i8* %arrayidx.7, align 4 + %12 = sub i8 %10, %11 + %arrayidx.8 = getelementptr inbounds i8, i8* %I, i64 %indvars.iv + %13 = load i8, i8* %arrayidx.8, align 4 + %14 = sub i8 %12, %13 + %arrayidx.9 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %14, i8* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_i16( +; CHECK: %[[V0:.*]] = sub i16 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i16 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i16 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i16 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i16 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i16 %[[V4]], {{.*}} +; CHECK: sub i16 %[[V5]], {{.*}} +define void @sub_i16(i16* noalias %A, i16* noalias %B, i16* noalias %C, i16* noalias %D, i16* noalias %E, i16* noalias %F, i16* noalias %G, i16* noalias %H, i16* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, i16* %B, i64 %indvars.iv + %0 = load i16, i16* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i16, i16* %C, i64 %indvars.iv + %1 = load i16, i16* %arrayidx.2, align 4 + %2 = sub i16 %0, %1 + %arrayidx.3 = getelementptr inbounds i16, i16* %D, i64 %indvars.iv + %3 = load i16, i16* %arrayidx.3, align 4 + %4 = sub i16 %2, %3 + %arrayidx.4 = getelementptr inbounds i16, i16* %E, i64 %indvars.iv + %5 = load i16, i16* %arrayidx.4, align 4 + %6 = sub i16 %4, %5 + %arrayidx.5 = getelementptr inbounds i16, i16* %F, i64 %indvars.iv + %7 = load i16, i16* %arrayidx.5, align 4 + %8 = sub i16 %6, %7 + %arrayidx.6 = getelementptr inbounds i16, i16* %G, i64 %indvars.iv + %9 = load i16, i16* %arrayidx.6, align 4 + %10 = sub i16 %8, %9 + %arrayidx.7 = getelementptr inbounds i16, i16* %H, i64 %indvars.iv + %11 = load i16, i16* %arrayidx.7, align 4 + %12 = sub i16 %10, %11 + %arrayidx.8 = getelementptr inbounds i16, i16* %I, i64 %indvars.iv + %13 = load i16, i16* %arrayidx.8, align 4 + %14 = sub i16 %12, %13 + %arrayidx.9 = getelementptr inbounds i16, i16* %A, i64 %indvars.iv + store i16 %14, i16* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_i32( +; CHECK: %[[V0:.*]] = sub i32 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i32 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i32 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i32 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i32 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i32 %[[V4]], {{.*}} +; CHECK: sub i32 %[[V5]], {{.*}} +define void @sub_i32(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32* noalias %E, i32* noalias %F, i32* noalias %G, i32* noalias %H, i32* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx.2, align 4 + %2 = sub i32 %0, %1 + %arrayidx.3 = getelementptr inbounds i32, i32* %D, i64 %indvars.iv + %3 = load i32, i32* %arrayidx.3, align 4 + %4 = sub i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, i32* %E, i64 %indvars.iv + %5 = load i32, i32* %arrayidx.4, align 4 + %6 = sub i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, i32* %F, i64 %indvars.iv + %7 = load i32, i32* %arrayidx.5, align 4 + %8 = sub i32 %6, %7 + %arrayidx.6 = getelementptr inbounds i32, i32* %G, i64 %indvars.iv + %9 = load i32, i32* %arrayidx.6, align 4 + %10 = sub i32 %8, %9 + %arrayidx.7 = getelementptr inbounds i32, i32* %H, i64 %indvars.iv + %11 = load i32, i32* %arrayidx.7, align 4 + %12 = sub i32 %10, %11 + %arrayidx.8 = getelementptr inbounds i32, i32* %I, i64 %indvars.iv + %13 = load i32, i32* %arrayidx.8, align 4 + %14 = sub i32 %12, %13 + %arrayidx.9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %14, i32* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_i64( +; CHECK: %[[V0:.*]] = sub i64 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i64 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i64 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i64 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i64 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i64 %[[V4]], {{.*}} +; CHECK: sub i64 %[[V5]], {{.*}} +define void @sub_i64(i64* noalias %A, i64* noalias %B, i64* noalias %C, i64* noalias %D, i64* noalias %E, i64* noalias %F, i64* noalias %G, i64* noalias %H, i64* noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, i64* %B, i64 %indvars.iv + %0 = load i64, i64* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i64, i64* %C, i64 %indvars.iv + %1 = load i64, i64* %arrayidx.2, align 4 + %2 = sub i64 %0, %1 + %arrayidx.3 = getelementptr inbounds i64, i64* %D, i64 %indvars.iv + %3 = load i64, i64* %arrayidx.3, align 4 + %4 = sub i64 %2, %3 + %arrayidx.4 = getelementptr inbounds i64, i64* %E, i64 %indvars.iv + %5 = load i64, i64* %arrayidx.4, align 4 + %6 = sub i64 %4, %5 + %arrayidx.5 = getelementptr inbounds i64, i64* %F, i64 %indvars.iv + %7 = load i64, i64* %arrayidx.5, align 4 + %8 = sub i64 %6, %7 + %arrayidx.6 = getelementptr inbounds i64, i64* %G, i64 %indvars.iv + %9 = load i64, i64* %arrayidx.6, align 4 + %10 = sub i64 %8, %9 + %arrayidx.7 = getelementptr inbounds i64, i64* %H, i64 %indvars.iv + %11 = load i64, i64* %arrayidx.7, align 4 + %12 = sub i64 %10, %11 + %arrayidx.8 = getelementptr inbounds i64, i64* %I, i64 %indvars.iv + %13 = load i64, i64* %arrayidx.8, align 4 + %14 = sub i64 %12, %13 + %arrayidx.9 = getelementptr inbounds i64, i64* %A, i64 %indvars.iv + store i64 %14, i64* %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: test/Transforms/TreeHeightReduction/leaf-num-check.ll =================================================================== --- test/Transforms/TreeHeightReduction/leaf-num-check.ll +++ test/Transforms/TreeHeightReduction/leaf-num-check.ll @@ -0,0 +1,69 @@ +; RUN: opt -O1 -S < %s | FileCheck %s + +; CHECK-LABEL: @leaf_num_is_3( +; CHECK: %[[V0:.*]] = add nsw i32 {{.*}}, {{.*}} +; CHECK: add nsw i32 %[[V0]], {{.*}} +define void @leaf_num_is_3(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx.2, align 4 + %2 = add nsw i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, i32* %D, i64 %indvars.iv + %3 = load i32, i32* %arrayidx.3, align 4 + %4 = add nsw i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %4, i32* %arrayidx.1, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @leaf_num_is_4( +; CHECK: %[[V0:.*]] = add nsw i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add nsw i32 {{.*}}, {{.*}} +; CHECK-NEXT: add nsw i32 %[[V0]], %[[V1]] +define void @leaf_num_is_4(i32* noalias %A, i32* noalias %B, i32* noalias %C, i32* noalias %D, i32* noalias %E, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx.2, align 4 + %2 = add nsw i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, i32* %D, i64 %indvars.iv + %3 = load i32, i32* %arrayidx.3, align 4 + %4 = add nsw i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, i32* %E, i64 %indvars.iv + %5 = load i32, i32* %arrayidx.4, align 4 + %6 = add nsw i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %6, i32* %arrayidx.5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +}