Index: llvm/include/llvm/Transforms/Scalar/TreeHeightReduction.h =================================================================== --- /dev/null +++ llvm/include/llvm/Transforms/Scalar/TreeHeightReduction.h @@ -0,0 +1,25 @@ +//===- TreeHeightReduction.h - Minimize the height of an operation tree ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_TREEHEIGHTREDUCTION_H +#define LLVM_TRANSFORMS_SCALAR_TREEHEIGHTREDUCTION_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +class TreeHeightReductionPass : public PassInfoMixin { +public: + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_TREEHEIGHTREDUCTION_H Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -215,6 +215,7 @@ #include "llvm/Transforms/Scalar/StructurizeCFG.h" #include "llvm/Transforms/Scalar/TLSVariableHoist.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" +#include "llvm/Transforms/Scalar/TreeHeightReduction.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" Index: llvm/lib/Passes/PassBuilderPipelines.cpp =================================================================== --- llvm/lib/Passes/PassBuilderPipelines.cpp +++ llvm/lib/Passes/PassBuilderPipelines.cpp @@ -114,6 +114,7 @@ #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Scalar/SpeculativeExecution.h" #include "llvm/Transforms/Scalar/TailRecursionElimination.h" +#include "llvm/Transforms/Scalar/TreeHeightReduction.h" #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Transforms/Utils/AddDiscriminators.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" @@ -1241,6 +1242,11 @@ addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); + // Increase instruction-level parallelism by reordering associative and + // commutative operations in a loop. Putting this pass after the loop + // unrolling pass will produce a better effect if the loop has reductions. + OptimizePM.addPass(createFunctionToLoopPassAdaptor(TreeHeightReductionPass())); + // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -527,6 +527,7 @@ LOOP_PASS("loop-bound-split", LoopBoundSplitPass()) LOOP_PASS("loop-reroll", LoopRerollPass()) LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass()) +LOOP_PASS("tree-height-reduction", TreeHeightReductionPass()) #undef LOOP_PASS #ifndef LOOP_PASS_WITH_PARAMS Index: llvm/lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/Scalar/CMakeLists.txt +++ llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -77,6 +77,7 @@ StructurizeCFG.cpp TailRecursionElimination.cpp TLSVariableHoist.cpp + TreeHeightReduction.cpp WarnMissedTransforms.cpp ADDITIONAL_HEADER_DIRS Index: llvm/lib/Transforms/Scalar/TreeHeightReduction.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/Scalar/TreeHeightReduction.cpp @@ -0,0 +1,726 @@ +//===- TreeHeightReduction.cpp - Minimize the height of an operation tree -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements a tree-height-reduction pass. +// +// Tree height reduction is an optimization to increase instruction level +// parallelism by transforming an operation tree like the following. +// +// Before After +// +// _ N1 _ _________ N1 _________ +// | | | | +// _ N2 _ L1 ___ N2 ___ ___ N3 ___ +// | | | | | | +// _ N3 _ L2 ---> _ N4 _ _ N5 _ _ N6 _ _ N7 _ +// | | | | | | | | | | +// _ N4 _ L3 L1 L2 L3 L4 L5 L6 L7 L8 +// | | +// _ N5 _ L4 +// | | +// _ N6 _ L5 +// | | +// _ N7 _ L6 +// | | +// L7 L8 +// +//===----------------------------------------------------------------------===// +// +// The algorithm of tree height reduction is based on the paper: +// Katherine Coons, Warren Hunt, Bertrand A. Maher, Doug Burger, +// Kathryn S. McKinley. +// Optimal Huffman Tree-Height Reduction for Instruction-Level Parallelism. +// + +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/TreeHeightReduction.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "tree-height-reduction" + +// Whether to apply tree height reduction to integer instructions. +static cl::opt EnableIntTHR( + "enable-int-thr", + cl::desc("Enable tree height reduction to integer instructions"), + cl::init(false)); + +// Whether to apply tree height reduction to floating-point instructions. +static cl::opt EnableFpTHR( + "enable-fp-thr", + cl::desc("Enable tree height reduction to floating-point instructions"), + cl::init(false)); + +// Minimum number of leaves to apply tree height reduction. +// Tree height reduction has effect only if the number of leaves is 4 or more. +static cl::opt MinLeaves( + "thr-min-leaves", + cl::desc("Minimum number of leaves to apply tree height reduction " + "(default = 4)"), + cl::init(4)); + +namespace { +class Node { +public: + explicit Node(Value *V) + : Inst(nullptr), DefValue(V), Parent(nullptr), Left(nullptr), + Right(nullptr), Latency(0), TotalCost(0) { + if (Instruction *I = dyn_cast(V)) { + Inst = I; + } + } + + /// Set the parent node of this node. + void setParent(Node *P) { Parent = P; } + + /// Set the left node of this node. + void setLeft(Node *L) { Left = L; } + + /// Set the right node of this node. + void setRight(Node *R) { Right = R; } + + /// Set the latency of this node. + void setLatency(InstructionCost L) { Latency = L; } + + /// Set the total cost of this node. + void setTotalCost(InstructionCost C) { TotalCost = C; } + + /// Get the original instruction of this node. + Instruction *getOrgInst() const { + assert(Inst && "Inst should not be nullptr."); + return Inst; + } + + /// Get the defined value of this node. + Value *getDefinedValue() const { + assert(DefValue && "DefValue should not be nullptr"); + return DefValue; + } + + /// Get the parent node of this node. + Node *getParent() const { return Parent; } + + /// Get the left node of this node. + Node *getLeft() const { return Left; } + + /// Get the right node of this node. + Node *getRight() const { return Right; } + + /// Get the latency of this node. + InstructionCost getLatency() const { return Latency; } + + /// Get the total cost of this node. + InstructionCost getTotalCost() const { return TotalCost; } + + /// Return true if this node is a branch (including a root). + bool isBranch() const { return Left != nullptr && Right != nullptr; } + + /// Return true if this node is a pure leaf. + bool isLeaf() const { return !isBranch(); } + + /// Return true if this node is a root. + bool isRoot() const { return Parent == nullptr; } + + /// Return true if this node is considered as a leaf node. + /// Tree height reduction can be applied only to nodes whose operation codes + /// are same. In addition, IR flags like 'nuw' must be same to preserve them. + /// So it is necessary to consider a node whose operation code or IR flags + /// are different from its parent's ones as a leaf node. + bool isConsideredAsLeaf() const { + if (isLeaf()) + return true; + if (isRoot()) + return false; + if (getOrgInst()->getOpcode() != getParent()->getOrgInst()->getOpcode() || + !getOrgInst()->hasSameSubclassOptionalData(getParent()->getOrgInst())) + return true; + return false; + } + + /// Update tree's latency under this node. + void updateTreeLatency(TargetTransformInfo *TTI); + + /// Update the latency of this node. + void updateNodeLatency(TargetTransformInfo *TTI); + + /// Update the latency of this node using the left or right node. + void updateLeftOrRightNodeLatency(TargetTransformInfo *TTI, Node *SubNode); + +private: + /// Original instruction of this node. + Instruction *Inst; + + /// Defined value of this node. + /// This may be a constant value. + Value *DefValue; + + /// Parent node of this node. + Node *Parent; + /// Left node and right node of this node. + Node *Left, *Right; + + /// Instruction latency. + InstructionCost Latency; + /// Total cost of nodes under this node. + InstructionCost TotalCost; +}; + +class TreeHeightReduction { +public: + enum struct InstTy { INTEGER, FLOATING_POINT }; + + TreeHeightReduction(InstTy Ty, TargetTransformInfo *TTI, + OptimizationRemarkEmitter *ORE) + : TargetInstTy(Ty), TTI(TTI), ORE(ORE) {} + + /// Apply tree height reduction to one basic block. + bool runOnBasicBlock(BasicBlock *BB); + +private: + /// Type of instruction included in the operation tree. + InstTy TargetInstTy; + + TargetTransformInfo *TTI; + OptimizationRemarkEmitter *ORE; + + /// Return true if 'I' is a target instruction of tree height reduction. + bool isTHRTargetInst(Instruction *I) const; + + /// Construct an operation tree from the value 'V'. + Node *constructTree(Value *V, BasicBlock *BB); + + /// Destruct an operation tree constructed by constructTree. + void destructTree(Node *N); + + /// Collect original instructions to be erased from the basic block. + void collectInstsToBeErasedFrom(Node *N, std::vector &Insts); + + /// Apply tree height reduction to the tree 'N'. + /// Returned value is a tree to which tree height reduction is applied. + Node *applyTreeHeightReduction(Node *N, bool isLeft); + + /// Collect leaf nodes and reusable branch nodes under the node 'N'. + void collectLeavesAndReusableBranches(Node *N, std::vector &Leaves, + std::vector &ReusableBranches); + + /// Construct an optimized subtree by applying tree height reduction. + Node *constructOptimizedSubtree(std::vector &Leaves, + std::vector &ReusableBranches); + + /// Combine two leaf elements, create a branch from them, and put it into + /// 'Leaves'. + void combineLeaves(std::vector &Leaves, Node *Op1, Node *Op2, + std::vector &ReusableBranches); + + /// Create IRs for the new tree. + void createIRs(Node *Root, std::set &GeneratedInsts); + + /// Create a new instruction for the node 'N' with operands 'Op1' and 'Op2'. + Value *createInst(IRBuilder<> &Builder, Node *N, Value *Op1, Value *Op2); + + /// Return true if 'I' is a root candidate. + bool isRootCandidate(Instruction &I) const { + if (!isTHRTargetInst(&I)) + return false; + for (unsigned i = 0; i < I.getNumOperands(); ++i) + if (isBranchCandidate(I.getOperand(i))) + return true; + return false; + } + + /// Return true if 'Op' is a branch candidate. + bool isBranchCandidate(Value *Op) const { + assert(Op && "Operand should not be nullptr"); + if (!Op->hasOneUse()) + return false; + if (Instruction *I = dyn_cast(Op)) + return isTHRTargetInst(I); + return false; + } + + void printTree(raw_ostream &OS, Node *N, const int Indent) const { + if (N->isLeaf()) + return; + + for (int i = 0; i < Indent; ++i) + OS << " "; + N->getOrgInst()->dump(); + + for (int i = 0; i < Indent + 4; ++i) + OS << " "; + OS << "Latency: " << N->getLatency(); + OS << ", TotalCost: " << N->getTotalCost() << "\n"; + + printTree(OS, N->getLeft(), Indent + 2); + printTree(OS, N->getRight(), Indent + 2); + } + + void printLeaves(raw_ostream &OS, std::vector &Leaves, + bool isBefore) const { + if (isBefore) + OS << " --- Before ---\n"; + else + OS << " --- After ---\n"; + for (auto *Node : Leaves) + printTree(OS, Node, 2); + } +}; +} // end anonymous namespace + +// BFS: Breadth First Search +static std::vector getNodesByBFS(Node *N) { + std::vector Nodes = {N}; + + for (unsigned i = 0; i < Nodes.size(); ++i) { + Node *CurNode = Nodes[i]; + if (CurNode->isBranch()) { + Nodes.push_back(CurNode->getLeft()); + Nodes.push_back(CurNode->getRight()); + } + } + + return std::move(Nodes); +} + +void Node::updateTreeLatency(TargetTransformInfo *TTI) { + std::vector Nodes = getNodesByBFS(this); + // Update node latency by bottom-up order. + for (auto Begin = Nodes.rbegin(), End = Nodes.rend(); Begin != End; ++Begin) { + Node *CurNode = *Begin; + CurNode->updateNodeLatency(TTI); + } +} + +void Node::updateNodeLatency(TargetTransformInfo *TTI) { + setLatency(0); + setTotalCost(0); + + if (isLeaf()) + return; + + // Tree height reduction minimizes the weighted sum of heights. + // The latency of each instruction is used as the height of each node. + // This makes an optimal operation tree even if operation types (add, mul, + // etc.) are mixed in the tree. + + updateLeftOrRightNodeLatency(TTI, getLeft()); + updateLeftOrRightNodeLatency(TTI, getRight()); + + Instruction *Inst = getOrgInst(); + InstructionCost InstLatency = + TTI->getInstructionCost(Inst, TargetTransformInfo::TCK_Latency); + // To balance the operation tree by sorting nodes by latency even if the + // latency is unknown, set a valid latency. + if (!InstLatency.isValid()) + InstLatency = InstructionCost(1); + setLatency(getLatency() + InstLatency); + setTotalCost(getTotalCost() + InstLatency); +} + +void Node::updateLeftOrRightNodeLatency(TargetTransformInfo *TTI, + Node *SubNode) { + assert(SubNode && "Left or right node should not be nullptr."); + const InstructionCost SubNodeLatency = SubNode->getLatency(); + if (SubNodeLatency > getLatency()) + setLatency(SubNodeLatency); + setTotalCost(getTotalCost() + SubNode->getTotalCost()); +} + +static void eraseOrgInsts(std::vector &Insts) { + for (auto *I : Insts) + I->eraseFromParent(); +} + +static bool isProfitableToApply(Node *Root) { + std::vector Nodes = getNodesByBFS(Root); + unsigned NumLeaves = 0; + + for (auto *N : Nodes) { + if (N->isLeaf()) + ++NumLeaves; + if (NumLeaves >= MinLeaves) + return true; + } + + return false; +} + +bool TreeHeightReduction::runOnBasicBlock(BasicBlock *BB) { + bool Applied = false; + std::set GeneratedInsts; + + // Search target instructions in the basic block reversely. + for (auto Begin = BB->rbegin(); Begin != BB->rend(); ++Begin) { + Instruction &I = *Begin; + if (GeneratedInsts.count(&I) == 1) + continue; + if (!isRootCandidate(I)) + continue; + + // Construct an original operation tree from the root instruction. + Node *OrgTree = constructTree(&I, BB); + + if (!isProfitableToApply(OrgTree)) { + destructTree(OrgTree); + continue; + } + + std::vector OrgInsts; + collectInstsToBeErasedFrom(OrgTree, OrgInsts); + + // Apply tree height reduction to the original tree 'OrgTree' + // and create a new tree 'ReducedTree'. + Node *ReducedTree = applyTreeHeightReduction(OrgTree, true); + assert(ReducedTree->getOrgInst() == OrgTree->getOrgInst() && + "OrgInst of ReducedTree and OrgTree should be same."); + + // Create IRs from the tree to which tree height reduction was applied. + createIRs(ReducedTree, GeneratedInsts); + + // The following optimization message output process must be called + // before 'eraseOrgInsts' because 'I' is erased there. + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "TreeHeightReduction", &I) + << "reduced tree height"; + }); + + --Begin; + eraseOrgInsts(OrgInsts); + Applied = true; + + // 'OrgTree' and 'ReducedTree' share memory, so it is enough to release + // memory of 'ReducedTree'. + destructTree(ReducedTree); + } + + return Applied; +} + +template static bool isUsedAtCmpInst(Instruction *I) { + for (User *U : I->users()) { + Instruction *UserInst = dyn_cast(U); + if (UserInst && isa(*UserInst)) + return true; + } + return false; +} + +// Return true if 'I' is an integer type and is a target instruction of tree +// height reduction. +static bool isIntgerInstTHRTarget(Instruction *I) { + if (!I->getType()->isIntegerTy()) + return false; + // 'I' which is used at ICmpInst may be an induction variable. + if (isUsedAtCmpInst(I)) + return false; + // Add, Mul, And, Or, or Xor + return I->isCommutative() && I->isAssociative(); +} + +// Return true if 'I' is a floating-point type and is a target instruction +// of tree height reduction. +static bool isFpInstTHRTarget(Instruction *I) { + if (!I->getType()->isFloatingPointTy()) + return false; + // FAdd or FMul with reassoc + return I->isCommutative() && I->isAssociative(); +} + +bool TreeHeightReduction::isTHRTargetInst(Instruction *I) const { + switch (TargetInstTy) { + case InstTy::INTEGER: + return isIntgerInstTHRTarget(I); + case InstTy::FLOATING_POINT: + return isFpInstTHRTarget(I); + default: + return false; + } +} + +Node *TreeHeightReduction::constructTree(Value *V, BasicBlock *BB) { + if (!isBranchCandidate(V)) + return new Node(V); + + Instruction *I = dyn_cast(V); + assert(I && "Instruction should not be nullptr."); + assert(I->getNumOperands() == 2 && "The number of operands should be 2."); + + if (I->getParent() != BB) + return new Node(V); + + Node *Parent = new Node(V); + + Value *LeftOp = I->getOperand(0); + Node *Left = constructTree(LeftOp, BB); + Parent->setLeft(Left); + Left->setParent(Parent); + + Value *RightOp = I->getOperand(1); + Node *Right = constructTree(RightOp, BB); + Parent->setRight(Right); + Right->setParent(Parent); + + return Parent; +} + +void TreeHeightReduction::destructTree(Node *N) { + std::vector Nodes = getNodesByBFS(N); + for (auto *CurNode : Nodes) + delete CurNode; +} + +void TreeHeightReduction::collectInstsToBeErasedFrom( + Node *N, std::vector &Insts) { + std::vector Nodes = getNodesByBFS(N); + for (auto *CurNode : Nodes) + // Instructions belonging to leaf nodes should be saved. + if (!CurNode->isLeaf()) + Insts.push_back(CurNode->getOrgInst()); +} + +Node *TreeHeightReduction::applyTreeHeightReduction(Node *N, bool isLeft) { + // Postorder depth-first search. + if (!N->isBranch()) + return N; + applyTreeHeightReduction(N->getLeft(), true); + applyTreeHeightReduction(N->getRight(), false); + + // Save original parent information. + Node *Parent = N->getParent(); + + std::vector Leaves; + // 'ReusableBranches' holds branch nodes which are reused when updating + // parent and child node's relationship in constructOptimizedSubtree(). + // By doing so, the amount of memory used can be reduced. + std::vector ReusableBranches; + collectLeavesAndReusableBranches(N, Leaves, ReusableBranches); + + Node *NewNode = constructOptimizedSubtree(Leaves, ReusableBranches); + NewNode->setParent(Parent); + if (Parent) { + if (isLeft) + Parent->setLeft(NewNode); + else + Parent->setRight(NewNode); + } + // Return value has a meaning only if 'Parent' is nullptr because + // this means 'Node' is a root node. + return NewNode; +} + +void TreeHeightReduction::collectLeavesAndReusableBranches( + Node *N, std::vector &Leaves, + std::vector &ReusableBranches) { + std::vector Worklist = {N}; + + // NOTE: Don't use 'getNodesAndLeavesByBFS'! Like that function, the following + // processing collects all leaves and all branches by BFS, but it is + // different from that function because this is BFS with a condition. + for (unsigned i = 0; i < Worklist.size(); ++i) { + Node *CurNode = Worklist[i]; + if (CurNode->isConsideredAsLeaf()) { + Leaves.push_back(CurNode); + } else { + ReusableBranches.push_back(CurNode); + Worklist.push_back(CurNode->getLeft()); + Worklist.push_back(CurNode->getRight()); + } + } +} + +Node *TreeHeightReduction::constructOptimizedSubtree( + std::vector &Leaves, std::vector &ReusableBranches) { + while (Leaves.size() > 1) { + llvm::stable_sort(Leaves, [](Node *LHS, Node *RHS) -> bool { + if (LHS->getLatency() != RHS->getLatency()) + return LHS->getLatency() < RHS->getLatency(); + if (LHS->getTotalCost() != RHS->getTotalCost()) + return LHS->getTotalCost() < RHS->getTotalCost(); + return false; + }); + LLVM_DEBUG(printLeaves(dbgs(), Leaves, true)); + + Node *Op1 = Leaves[0], *Op2 = Leaves[1]; + Leaves.erase(Leaves.begin(), Leaves.begin() + 2); + combineLeaves(Leaves, Op1, Op2, ReusableBranches); + + LLVM_DEBUG(printLeaves(dbgs(), Leaves, false)); + } + + return Leaves.front(); +} + +void TreeHeightReduction::combineLeaves(std::vector &Leaves, Node *Op1, + Node *Op2, + std::vector &ReusableBranches) { + Node *N = ReusableBranches.back(); + ReusableBranches.pop_back(); + + // Update the parent-child relationship. + N->setLeft(Op1); + N->setRight(Op2); + Op1->setParent(N); + Op2->setParent(N); + + N->updateTreeLatency(TTI); + Leaves.push_back(N); +} + +static std::vector getBranches(Node *N) { + std::vector Nodes = getNodesByBFS(N); + + // Exclude leaves. + for (auto Iter = Nodes.begin(); Iter != Nodes.end(); ++Iter) + if ((*Iter)->isLeaf()) { + Nodes.erase(Iter); + --Iter; + } + + return std::move(Nodes); +} + +static void setLeftAndRightOperand(Value *&LeftOp, Value *&RightOp, Node *N, + std::queue &Ops) { + auto frontPopVal = [](std::queue &Ops) -> Value * { + Value *V = Ops.front(); + Ops.pop(); + return V; + }; + + Node *L = N->getLeft(); + Node *R = N->getRight(); + if (L->isLeaf() && R->isLeaf()) { + // Both the left child and the right child of 'N' are leaves. + LeftOp = L->getDefinedValue(); + RightOp = R->getDefinedValue(); + } else { + assert(!Ops.empty()); + if (L->isLeaf()) { + // The left child is a leaf and the right child is a branch. + LeftOp = L->getDefinedValue(); + RightOp = frontPopVal(Ops); + } else if (R->isLeaf()) { + // The left child is a branch and the right child is a leaf. + LeftOp = frontPopVal(Ops); + RightOp = R->getDefinedValue(); + } else { + // Both the left child and the right child of 'N' are branches. + LeftOp = frontPopVal(Ops); + RightOp = frontPopVal(Ops); + } + } +} + +void TreeHeightReduction::createIRs(Node *Root, + std::set &GeneratedInsts) { + Instruction *RootInst = Root->getOrgInst(); + IRBuilder<> Builder(RootInst); + + std::vector Nodes = getBranches(Root); + std::queue Ops; + while (!Nodes.empty()) { + Node *N = Nodes.back(); + + Value *LeftOp = nullptr, *RightOp = nullptr; + setLeftAndRightOperand(LeftOp, RightOp, N, Ops); + + Value *NewNodeValue = createInst(Builder, N, LeftOp, RightOp); + Ops.push(NewNodeValue); + GeneratedInsts.insert(dyn_cast(NewNodeValue)); + + Nodes.pop_back(); + } + + assert(Ops.size() == 1 && "The size of queue should be 1."); + Value *NewRootValue = Ops.front(); + GeneratedInsts.insert(dyn_cast(NewRootValue)); + RootInst->replaceAllUsesWith(NewRootValue); +} + +Value *TreeHeightReduction::createInst(IRBuilder<> &Builder, Node *N, + Value *Op1, Value *Op2) { + Value *V; + switch (N->getOrgInst()->getOpcode()) { + case Instruction::Add: + V = Builder.CreateAdd(Op1, Op2, Twine("thr.add")); + break; + case Instruction::Mul: + V = Builder.CreateMul(Op1, Op2, Twine("thr.mul")); + break; + case Instruction::And: + V = Builder.CreateAnd(Op1, Op2, Twine("thr.and")); + break; + case Instruction::Or: + V = Builder.CreateOr(Op1, Op2, Twine("thr.or")); + break; + case Instruction::Xor: + V = Builder.CreateXor(Op1, Op2, Twine("thr.xor")); + break; + case Instruction::FAdd: + V = Builder.CreateFAdd(Op1, Op2, Twine("thr.fadd")); + break; + case Instruction::FMul: + V = Builder.CreateFMul(Op1, Op2, Twine("thr.fmul")); + break; + default: + llvm_unreachable("Unexpected operation code"); + } + + // Take over the original instruction IR flags. + // V may have been folded to Constant if Op1 and Op2 are both Constant. + if (Instruction *NewInst = dyn_cast(V)) + NewInst->copyIRFlags(N->getDefinedValue()); + + return V; +} + +PreservedAnalyses TreeHeightReductionPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + // Tree height reduction is applied only to inner-most loops. + if (!L.isInnermost()) + return PreservedAnalyses::all(); + + OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); + bool Changed = false; + auto &LoopBlocks = L.getBlocksVector(); + + if (EnableIntTHR) { + auto THR = TreeHeightReduction(TreeHeightReduction::InstTy::INTEGER, + &AR.TTI, &ORE); + for (auto *BB : LoopBlocks) + Changed |= THR.runOnBasicBlock(BB); + } + + if (EnableFpTHR) { + auto THR = TreeHeightReduction(TreeHeightReduction::InstTy::FLOATING_POINT, + &AR.TTI, &ORE); + for (auto *BB : LoopBlocks) + Changed |= THR.runOnBasicBlock(BB); + } + + if (!Changed) + return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); +} Index: llvm/test/Other/new-pm-defaults.ll =================================================================== --- llvm/test/Other/new-pm-defaults.ll +++ llvm/test/Other/new-pm-defaults.ll @@ -260,6 +260,9 @@ ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running pass: LICMPass ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Running pass: TreeHeightReductionPass ; CHECK-O-NEXT: Running pass: LoopSinkPass ; CHECK-O-NEXT: Running pass: InstSimplifyPass ; CHECK-O-NEXT: Running pass: DivRemPairsPass Index: llvm/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-defaults.ll +++ llvm/test/Other/new-pm-thinlto-defaults.ll @@ -224,6 +224,9 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass ; CHECK-POSTLINK-O-NEXT: Running pass: LICMPass ; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass +; CHECK-POSTLINK-O-NEXT: Running pass: TreeHeightReductionPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifyPass ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -193,6 +193,9 @@ ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running pass: LICMPass ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Running pass: TreeHeightReductionPass ; CHECK-O-NEXT: Running pass: LoopSinkPass ; CHECK-O-NEXT: Running pass: InstSimplifyPass ; CHECK-O-NEXT: Running pass: DivRemPairsPass Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -205,6 +205,9 @@ ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running pass: LICMPass ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass +; CHECK-O-NEXT: Running pass: LoopSimplifyPass +; CHECK-O-NEXT: Running pass: LCSSAPass +; CHECK-O-NEXT: Running pass: TreeHeightReductionPass ; CHECK-O-NEXT: Running pass: LoopSinkPass ; CHECK-O-NEXT: Running pass: InstSimplifyPass ; CHECK-O-NEXT: Running pass: DivRemPairsPass Index: llvm/test/Transforms/TreeHeightReduction/floating-point-add-only.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/floating-point-add-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_float( +; CHECK: %[[V0:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz float %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz float %[[V2]], %[[V3]] +; CHECK-NEXT: fadd reassoc nsz float %[[V4]], %[[V5]] +define void @add_float(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %C, i64 %indvars.iv + %1 = load float, ptr %arrayidx.2, align 4 + %2 = fadd reassoc nsz float %1, %0 + %arrayidx.3 = getelementptr inbounds float, ptr %D, i64 %indvars.iv + %3 = load float, ptr %arrayidx.3, align 4 + %4 = fadd reassoc nsz float %2, %3 + %arrayidx.4 = getelementptr inbounds float, ptr %E, i64 %indvars.iv + %5 = load float, ptr %arrayidx.4, align 4 + %6 = fadd reassoc nsz float %4, %5 + %arrayidx.5 = getelementptr inbounds float, ptr %F, i64 %indvars.iv + %7 = load float, ptr %arrayidx.5, align 4 + %8 = fadd reassoc nsz float %6, %7 + %arrayidx.6 = getelementptr inbounds float, ptr %G, i64 %indvars.iv + %9 = load float, ptr %arrayidx.6, align 4 + %10 = fadd reassoc nsz float %8, %9 + %arrayidx.7 = getelementptr inbounds float, ptr %H, i64 %indvars.iv + %11 = load float, ptr %arrayidx.7, align 4 + %12 = fadd reassoc nsz float %10, %11 + %arrayidx.8 = getelementptr inbounds float, ptr %I, i64 %indvars.iv + %13 = load float, ptr %arrayidx.8, align 4 + %14 = fadd reassoc nsz float %12, %13 + %arrayidx.9 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + store float %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_double( +; CHECK: %[[V0:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz double %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz double %[[V2]], %[[V3]] +; CHECK-NEXT: fadd reassoc nsz double %[[V4]], %[[V5]] +define void @add_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %0 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %C, i64 %indvars.iv + %1 = load double, ptr %arrayidx.2, align 4 + %2 = fadd reassoc nsz double %1, %0 + %arrayidx.3 = getelementptr inbounds double, ptr %D, i64 %indvars.iv + %3 = load double, ptr %arrayidx.3, align 4 + %4 = fadd reassoc nsz double %2, %3 + %arrayidx.4 = getelementptr inbounds double, ptr %E, i64 %indvars.iv + %5 = load double, ptr %arrayidx.4, align 4 + %6 = fadd reassoc nsz double %4, %5 + %arrayidx.5 = getelementptr inbounds double, ptr %F, i64 %indvars.iv + %7 = load double, ptr %arrayidx.5, align 4 + %8 = fadd reassoc nsz double %6, %7 + %arrayidx.6 = getelementptr inbounds double, ptr %G, i64 %indvars.iv + %9 = load double, ptr %arrayidx.6, align 4 + %10 = fadd reassoc nsz double %8, %9 + %arrayidx.7 = getelementptr inbounds double, ptr %H, i64 %indvars.iv + %11 = load double, ptr %arrayidx.7, align 4 + %12 = fadd reassoc nsz double %10, %11 + %arrayidx.8 = getelementptr inbounds double, ptr %I, i64 %indvars.iv + %13 = load double, ptr %arrayidx.8, align 4 + %14 = fadd reassoc nsz double %12, %13 + %arrayidx.9 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/floating-point-add-with-constant.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/floating-point-add-with-constant.ll @@ -0,0 +1,109 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_float_with_constant( +; CHECK: %[[V0:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz float {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz float {{.*}}, 1.000000e+01 +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz float %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd reassoc nsz float %[[V3]], %[[V4]] +; CHECK-NEXT: fadd reassoc nsz float %[[V5]], %[[V6]] +define void @add_float_with_constant(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %C, i64 %indvars.iv + %1 = load float, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds float, ptr %D, i64 %indvars.iv + %2 = load float, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds float, ptr %E, i64 %indvars.iv + %3 = load float, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds float, ptr %F, i64 %indvars.iv + %4 = load float, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds float, ptr %G, i64 %indvars.iv + %5 = load float, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds float, ptr %H, i64 %indvars.iv + %6 = load float, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds float, ptr %I, i64 %indvars.iv + %7 = load float, ptr %arrayidx.8, align 4 + %8 = fadd reassoc nsz float %0, 1.000000e+01 + %9 = fadd reassoc nsz float %8, %1 + %10 = fadd reassoc nsz float %9, %2 + %11 = fadd reassoc nsz float %10, %3 + %12 = fadd reassoc nsz float %11, %4 + %13 = fadd reassoc nsz float %12, %5 + %14 = fadd reassoc nsz float %13, %6 + %15 = fadd reassoc nsz float %14, %7 + %arrayidx.9 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + store float %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_double_with_constant( +; CHECK: %[[V0:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz double {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz double {{.*}}, 1.000000e+01 +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz double %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd reassoc nsz double %[[V3]], %[[V4]] +; CHECK-NEXT: fadd reassoc nsz double %[[V5]], %[[V6]] +define void @add_double_with_constant(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %0 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %C, i64 %indvars.iv + %1 = load double, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds double, ptr %D, i64 %indvars.iv + %2 = load double, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds double, ptr %E, i64 %indvars.iv + %3 = load double, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds double, ptr %F, i64 %indvars.iv + %4 = load double, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds double, ptr %G, i64 %indvars.iv + %5 = load double, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds double, ptr %H, i64 %indvars.iv + %6 = load double, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds double, ptr %I, i64 %indvars.iv + %7 = load double, ptr %arrayidx.8, align 4 + %8 = fadd reassoc nsz double %0, 1.000000e+01 + %9 = fadd reassoc nsz double %8, %1 + %10 = fadd reassoc nsz double %9, %2 + %11 = fadd reassoc nsz double %10, %3 + %12 = fadd reassoc nsz double %11, %4 + %13 = fadd reassoc nsz double %12, %5 + %14 = fadd reassoc nsz double %13, %6 + %15 = fadd reassoc nsz double %14, %7 + %arrayidx.9 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/floating-point-mult-only.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/floating-point-mult-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_float( +; CHECK: %[[V0:.*]] = fmul reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul reassoc nsz float {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul reassoc nsz float %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul reassoc nsz float %[[V2]], %[[V3]] +; CHECK-NEXT: fmul reassoc nsz float %[[V4]], %[[V5]] +define void @add_float(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %C, i64 %indvars.iv + %1 = load float, ptr %arrayidx.2, align 4 + %2 = fmul reassoc nsz float %1, %0 + %arrayidx.3 = getelementptr inbounds float, ptr %D, i64 %indvars.iv + %3 = load float, ptr %arrayidx.3, align 4 + %4 = fmul reassoc nsz float %2, %3 + %arrayidx.4 = getelementptr inbounds float, ptr %E, i64 %indvars.iv + %5 = load float, ptr %arrayidx.4, align 4 + %6 = fmul reassoc nsz float %4, %5 + %arrayidx.5 = getelementptr inbounds float, ptr %F, i64 %indvars.iv + %7 = load float, ptr %arrayidx.5, align 4 + %8 = fmul reassoc nsz float %6, %7 + %arrayidx.6 = getelementptr inbounds float, ptr %G, i64 %indvars.iv + %9 = load float, ptr %arrayidx.6, align 4 + %10 = fmul reassoc nsz float %8, %9 + %arrayidx.7 = getelementptr inbounds float, ptr %H, i64 %indvars.iv + %11 = load float, ptr %arrayidx.7, align 4 + %12 = fmul reassoc nsz float %10, %11 + %arrayidx.8 = getelementptr inbounds float, ptr %I, i64 %indvars.iv + %13 = load float, ptr %arrayidx.8, align 4 + %14 = fmul reassoc nsz float %12, %13 + %arrayidx.9 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + store float %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_double( +; CHECK: %[[V0:.*]] = fmul reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul reassoc nsz double {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul reassoc nsz double %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul reassoc nsz double %[[V2]], %[[V3]] +; CHECK-NEXT: fmul reassoc nsz double %[[V4]], %[[V5]] +define void @add_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %0 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %C, i64 %indvars.iv + %1 = load double, ptr %arrayidx.2, align 4 + %2 = fmul reassoc nsz double %1, %0 + %arrayidx.3 = getelementptr inbounds double, ptr %D, i64 %indvars.iv + %3 = load double, ptr %arrayidx.3, align 4 + %4 = fmul reassoc nsz double %2, %3 + %arrayidx.4 = getelementptr inbounds double, ptr %E, i64 %indvars.iv + %5 = load double, ptr %arrayidx.4, align 4 + %6 = fmul reassoc nsz double %4, %5 + %arrayidx.5 = getelementptr inbounds double, ptr %F, i64 %indvars.iv + %7 = load double, ptr %arrayidx.5, align 4 + %8 = fmul reassoc nsz double %6, %7 + %arrayidx.6 = getelementptr inbounds double, ptr %G, i64 %indvars.iv + %9 = load double, ptr %arrayidx.6, align 4 + %10 = fmul reassoc nsz double %8, %9 + %arrayidx.7 = getelementptr inbounds double, ptr %H, i64 %indvars.iv + %11 = load double, ptr %arrayidx.7, align 4 + %12 = fmul reassoc nsz double %10, %11 + %arrayidx.8 = getelementptr inbounds double, ptr %I, i64 %indvars.iv + %13 = load double, ptr %arrayidx.8, align 4 + %14 = fmul reassoc nsz double %12, %13 + %arrayidx.9 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/floating-point-sub-only.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/floating-point-sub-only.ll @@ -0,0 +1,105 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @sub_float( +; CHECK: %[[V0:.*]] = fsub reassoc nsz float {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub reassoc nsz float %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub reassoc nsz float %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub reassoc nsz float %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub reassoc nsz float %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub reassoc nsz float %[[V4]], {{.*}} +; CHECK: fsub reassoc nsz float %[[V5]], {{.*}} +define void @sub_float(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds float, ptr %B, i64 %indvars.iv + %0 = load float, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, ptr %C, i64 %indvars.iv + %1 = load float, ptr %arrayidx.2, align 4 + %2 = fsub reassoc nsz float %0, %1 + %arrayidx.3 = getelementptr inbounds float, ptr %D, i64 %indvars.iv + %3 = load float, ptr %arrayidx.3, align 4 + %4 = fsub reassoc nsz float %2, %3 + %arrayidx.4 = getelementptr inbounds float, ptr %E, i64 %indvars.iv + %5 = load float, ptr %arrayidx.4, align 4 + %6 = fsub reassoc nsz float %4, %5 + %arrayidx.5 = getelementptr inbounds float, ptr %F, i64 %indvars.iv + %7 = load float, ptr %arrayidx.5, align 4 + %8 = fsub reassoc nsz float %6, %7 + %arrayidx.6 = getelementptr inbounds float, ptr %G, i64 %indvars.iv + %9 = load float, ptr %arrayidx.6, align 4 + %10 = fsub reassoc nsz float %8, %9 + %arrayidx.7 = getelementptr inbounds float, ptr %H, i64 %indvars.iv + %11 = load float, ptr %arrayidx.7, align 4 + %12 = fsub reassoc nsz float %10, %11 + %arrayidx.8 = getelementptr inbounds float, ptr %I, i64 %indvars.iv + %13 = load float, ptr %arrayidx.8, align 4 + %14 = fsub reassoc nsz float %12, %13 + %arrayidx.9 = getelementptr inbounds float, ptr %A, i64 %indvars.iv + store float %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub reassoc nsz double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub reassoc nsz double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub reassoc nsz double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub reassoc nsz double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub reassoc nsz double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub reassoc nsz double %[[V4]], {{.*}} +; CHECK: fsub reassoc nsz double %[[V5]], {{.*}} +define void @sub_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %0 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %C, i64 %indvars.iv + %1 = load double, ptr %arrayidx.2, align 4 + %2 = fsub reassoc nsz double %0, %1 + %arrayidx.3 = getelementptr inbounds double, ptr %D, i64 %indvars.iv + %3 = load double, ptr %arrayidx.3, align 4 + %4 = fsub reassoc nsz double %2, %3 + %arrayidx.4 = getelementptr inbounds double, ptr %E, i64 %indvars.iv + %5 = load double, ptr %arrayidx.4, align 4 + %6 = fsub reassoc nsz double %4, %5 + %arrayidx.5 = getelementptr inbounds double, ptr %F, i64 %indvars.iv + %7 = load double, ptr %arrayidx.5, align 4 + %8 = fsub reassoc nsz double %6, %7 + %arrayidx.6 = getelementptr inbounds double, ptr %G, i64 %indvars.iv + %9 = load double, ptr %arrayidx.6, align 4 + %10 = fsub reassoc nsz double %8, %9 + %arrayidx.7 = getelementptr inbounds double, ptr %H, i64 %indvars.iv + %11 = load double, ptr %arrayidx.7, align 4 + %12 = fsub reassoc nsz double %10, %11 + %arrayidx.8 = getelementptr inbounds double, ptr %I, i64 %indvars.iv + %13 = load double, ptr %arrayidx.8, align 4 + %14 = fsub reassoc nsz double %12, %13 + %arrayidx.9 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/fp16-add-with-constant.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/fp16-add-with-constant.ll @@ -0,0 +1,55 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_half_with_constant( +; CHECK: %[[V0:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz half {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz half {{.*}}, 0xH4900 +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz half %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd reassoc nsz half %[[V3]], %[[V4]] +; CHECK-NEXT: fadd reassoc nsz half %[[V5]], %[[V6]] +define void @add_half_with_constant(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, ptr %B, i64 %indvars.iv + %0 = load half, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, ptr %C, i64 %indvars.iv + %1 = load half, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds half, ptr %D, i64 %indvars.iv + %2 = load half, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds half, ptr %E, i64 %indvars.iv + %3 = load half, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds half, ptr %F, i64 %indvars.iv + %4 = load half, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds half, ptr %G, i64 %indvars.iv + %5 = load half, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds half, ptr %H, i64 %indvars.iv + %6 = load half, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds half, ptr %I, i64 %indvars.iv + %7 = load half, ptr %arrayidx.8, align 4 + %8 = fadd reassoc nsz half %0, 0xH4900 + %9 = fadd reassoc nsz half %8, %1 + %10 = fadd reassoc nsz half %9, %2 + %11 = fadd reassoc nsz half %10, %3 + %12 = fadd reassoc nsz half %11, %4 + %13 = fadd reassoc nsz half %12, %5 + %14 = fadd reassoc nsz half %13, %6 + %15 = fadd reassoc nsz half %14, %7 + %arrayidx.9 = getelementptr inbounds half, ptr %A, i64 %indvars.iv + store half %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/fp16-add.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/fp16-add.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_half( +; CHECK: %[[V0:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz half %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz half %[[V2]], %[[V3]] +; CHECK-NEXT: fadd reassoc nsz half %[[V4]], %[[V5]] +define void @add_half(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, ptr %B, i64 %indvars.iv + %0 = load half, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, ptr %C, i64 %indvars.iv + %1 = load half, ptr %arrayidx.2, align 4 + %2 = fadd reassoc nsz half %1, %0 + %arrayidx.3 = getelementptr inbounds half, ptr %D, i64 %indvars.iv + %3 = load half, ptr %arrayidx.3, align 4 + %4 = fadd reassoc nsz half %2, %3 + %arrayidx.4 = getelementptr inbounds half, ptr %E, i64 %indvars.iv + %5 = load half, ptr %arrayidx.4, align 4 + %6 = fadd reassoc nsz half %4, %5 + %arrayidx.5 = getelementptr inbounds half, ptr %F, i64 %indvars.iv + %7 = load half, ptr %arrayidx.5, align 4 + %8 = fadd reassoc nsz half %6, %7 + %arrayidx.6 = getelementptr inbounds half, ptr %G, i64 %indvars.iv + %9 = load half, ptr %arrayidx.6, align 4 + %10 = fadd reassoc nsz half %8, %9 + %arrayidx.7 = getelementptr inbounds half, ptr %H, i64 %indvars.iv + %11 = load half, ptr %arrayidx.7, align 4 + %12 = fadd reassoc nsz half %10, %11 + %arrayidx.8 = getelementptr inbounds half, ptr %I, i64 %indvars.iv + %13 = load half, ptr %arrayidx.8, align 4 + %14 = fadd reassoc nsz half %12, %13 + %arrayidx.9 = getelementptr inbounds half, ptr %A, i64 %indvars.iv + store half %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/fp16-mult.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/fp16-mult.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_half( +; CHECK: %[[V0:.*]] = fmul reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul reassoc nsz half {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul reassoc nsz half %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul reassoc nsz half %[[V2]], %[[V3]] +; CHECK-NEXT: fmul reassoc nsz half %[[V4]], %[[V5]] +define void @add_half(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, ptr %B, i64 %indvars.iv + %0 = load half, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, ptr %C, i64 %indvars.iv + %1 = load half, ptr %arrayidx.2, align 4 + %2 = fmul reassoc nsz half %1, %0 + %arrayidx.3 = getelementptr inbounds half, ptr %D, i64 %indvars.iv + %3 = load half, ptr %arrayidx.3, align 4 + %4 = fmul reassoc nsz half %2, %3 + %arrayidx.4 = getelementptr inbounds half, ptr %E, i64 %indvars.iv + %5 = load half, ptr %arrayidx.4, align 4 + %6 = fmul reassoc nsz half %4, %5 + %arrayidx.5 = getelementptr inbounds half, ptr %F, i64 %indvars.iv + %7 = load half, ptr %arrayidx.5, align 4 + %8 = fmul reassoc nsz half %6, %7 + %arrayidx.6 = getelementptr inbounds half, ptr %G, i64 %indvars.iv + %9 = load half, ptr %arrayidx.6, align 4 + %10 = fmul reassoc nsz half %8, %9 + %arrayidx.7 = getelementptr inbounds half, ptr %H, i64 %indvars.iv + %11 = load half, ptr %arrayidx.7, align 4 + %12 = fmul reassoc nsz half %10, %11 + %arrayidx.8 = getelementptr inbounds half, ptr %I, i64 %indvars.iv + %13 = load half, ptr %arrayidx.8, align 4 + %14 = fmul reassoc nsz half %12, %13 + %arrayidx.9 = getelementptr inbounds half, ptr %A, i64 %indvars.iv + store half %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/fp16-sub.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/fp16-sub.ll @@ -0,0 +1,105 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @sub_half( +; CHECK: %[[V0:.*]] = fsub reassoc nsz half {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub reassoc nsz half %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub reassoc nsz half %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub reassoc nsz half %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub reassoc nsz half %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub reassoc nsz half %[[V4]], {{.*}} +; CHECK: fsub reassoc nsz half %[[V5]], {{.*}} +define void @sub_half(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds half, ptr %B, i64 %indvars.iv + %0 = load half, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds half, ptr %C, i64 %indvars.iv + %1 = load half, ptr %arrayidx.2, align 4 + %2 = fsub reassoc nsz half %0, %1 + %arrayidx.3 = getelementptr inbounds half, ptr %D, i64 %indvars.iv + %3 = load half, ptr %arrayidx.3, align 4 + %4 = fsub reassoc nsz half %2, %3 + %arrayidx.4 = getelementptr inbounds half, ptr %E, i64 %indvars.iv + %5 = load half, ptr %arrayidx.4, align 4 + %6 = fsub reassoc nsz half %4, %5 + %arrayidx.5 = getelementptr inbounds half, ptr %F, i64 %indvars.iv + %7 = load half, ptr %arrayidx.5, align 4 + %8 = fsub reassoc nsz half %6, %7 + %arrayidx.6 = getelementptr inbounds half, ptr %G, i64 %indvars.iv + %9 = load half, ptr %arrayidx.6, align 4 + %10 = fsub reassoc nsz half %8, %9 + %arrayidx.7 = getelementptr inbounds half, ptr %H, i64 %indvars.iv + %11 = load half, ptr %arrayidx.7, align 4 + %12 = fsub reassoc nsz half %10, %11 + %arrayidx.8 = getelementptr inbounds half, ptr %I, i64 %indvars.iv + %13 = load half, ptr %arrayidx.8, align 4 + %14 = fsub reassoc nsz half %12, %13 + %arrayidx.9 = getelementptr inbounds half, ptr %A, i64 %indvars.iv + store half %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub reassoc nsz double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub reassoc nsz double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub reassoc nsz double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub reassoc nsz double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub reassoc nsz double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub reassoc nsz double %[[V4]], {{.*}} +; CHECK: fsub reassoc nsz double %[[V5]], {{.*}} +define void @sub_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %0 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %C, i64 %indvars.iv + %1 = load double, ptr %arrayidx.2, align 4 + %2 = fsub reassoc nsz double %0, %1 + %arrayidx.3 = getelementptr inbounds double, ptr %D, i64 %indvars.iv + %3 = load double, ptr %arrayidx.3, align 4 + %4 = fsub reassoc nsz double %2, %3 + %arrayidx.4 = getelementptr inbounds double, ptr %E, i64 %indvars.iv + %5 = load double, ptr %arrayidx.4, align 4 + %6 = fsub reassoc nsz double %4, %5 + %arrayidx.5 = getelementptr inbounds double, ptr %F, i64 %indvars.iv + %7 = load double, ptr %arrayidx.5, align 4 + %8 = fsub reassoc nsz double %6, %7 + %arrayidx.6 = getelementptr inbounds double, ptr %G, i64 %indvars.iv + %9 = load double, ptr %arrayidx.6, align 4 + %10 = fsub reassoc nsz double %8, %9 + %arrayidx.7 = getelementptr inbounds double, ptr %H, i64 %indvars.iv + %11 = load double, ptr %arrayidx.7, align 4 + %12 = fsub reassoc nsz double %10, %11 + %arrayidx.8 = getelementptr inbounds double, ptr %I, i64 %indvars.iv + %13 = load double, ptr %arrayidx.8, align 4 + %14 = fsub reassoc nsz double %12, %13 + %arrayidx.9 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/integer-add-only.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/integer-add-only.ll @@ -0,0 +1,209 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-int-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_i8( +; CHECK: %[[V0:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i8 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i8 %[[V2]], %[[V3]] +; CHECK-NEXT: add i8 %[[V4]], %[[V5]] +define void @add_i8(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, ptr %B, i64 %indvars.iv + %0 = load i8, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i8, ptr %C, i64 %indvars.iv + %1 = load i8, ptr %arrayidx.2, align 1 + %2 = add i8 %1, %0 + %arrayidx.3 = getelementptr inbounds i8, ptr %D, i64 %indvars.iv + %3 = load i8, ptr %arrayidx.3, align 1 + %4 = add i8 %2, %3 + %arrayidx.4 = getelementptr inbounds i8, ptr %E, i64 %indvars.iv + %5 = load i8, ptr %arrayidx.4, align 1 + %6 = add i8 %4, %5 + %arrayidx.5 = getelementptr inbounds i8, ptr %F, i64 %indvars.iv + %7 = load i8, ptr %arrayidx.5, align 1 + %8 = add i8 %6, %7 + %arrayidx.6 = getelementptr inbounds i8, ptr %G, i64 %indvars.iv + %9 = load i8, ptr %arrayidx.6, align 1 + %10 = add i8 %8, %9 + %arrayidx.7 = getelementptr inbounds i8, ptr %H, i64 %indvars.iv + %11 = load i8, ptr %arrayidx.7, align 1 + %12 = add i8 %10, %11 + %arrayidx.8 = getelementptr inbounds i8, ptr %I, i64 %indvars.iv + %13 = load i8, ptr %arrayidx.8, align 1 + %14 = add i8 %12, %13 + %arrayidx.9 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv + store i8 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_i16( +; CHECK: %[[V0:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i16 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i16 %[[V2]], %[[V3]] +; CHECK-NEXT: add i16 %[[V4]], %[[V5]] +define void @add_i16(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, ptr %B, i64 %indvars.iv + %0 = load i16, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i16, ptr %C, i64 %indvars.iv + %1 = load i16, ptr %arrayidx.2, align 1 + %2 = add i16 %1, %0 + %arrayidx.3 = getelementptr inbounds i16, ptr %D, i64 %indvars.iv + %3 = load i16, ptr %arrayidx.3, align 1 + %4 = add i16 %2, %3 + %arrayidx.4 = getelementptr inbounds i16, ptr %E, i64 %indvars.iv + %5 = load i16, ptr %arrayidx.4, align 1 + %6 = add i16 %4, %5 + %arrayidx.5 = getelementptr inbounds i16, ptr %F, i64 %indvars.iv + %7 = load i16, ptr %arrayidx.5, align 1 + %8 = add i16 %6, %7 + %arrayidx.6 = getelementptr inbounds i16, ptr %G, i64 %indvars.iv + %9 = load i16, ptr %arrayidx.6, align 1 + %10 = add i16 %8, %9 + %arrayidx.7 = getelementptr inbounds i16, ptr %H, i64 %indvars.iv + %11 = load i16, ptr %arrayidx.7, align 1 + %12 = add i16 %10, %11 + %arrayidx.8 = getelementptr inbounds i16, ptr %I, i64 %indvars.iv + %13 = load i16, ptr %arrayidx.8, align 1 + %14 = add i16 %12, %13 + %arrayidx.9 = getelementptr inbounds i16, ptr %A, i64 %indvars.iv + store i16 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_i32( +; CHECK: %[[V0:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i32 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i32 %[[V2]], %[[V3]] +; CHECK-NEXT: add i32 %[[V4]], %[[V5]] +define void @add_i32(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx.2, align 1 + %2 = add i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + %3 = load i32, ptr %arrayidx.3, align 1 + %4 = add i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %5 = load i32, ptr %arrayidx.4, align 1 + %6 = add i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %7 = load i32, ptr %arrayidx.5, align 1 + %8 = add i32 %6, %7 + %arrayidx.6 = getelementptr inbounds i32, ptr %G, i64 %indvars.iv + %9 = load i32, ptr %arrayidx.6, align 1 + %10 = add i32 %8, %9 + %arrayidx.7 = getelementptr inbounds i32, ptr %H, i64 %indvars.iv + %11 = load i32, ptr %arrayidx.7, align 1 + %12 = add i32 %10, %11 + %arrayidx.8 = getelementptr inbounds i32, ptr %I, i64 %indvars.iv + %13 = load i32, ptr %arrayidx.8, align 1 + %14 = add i32 %12, %13 + %arrayidx.9 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_i64( +; CHECK: %[[V0:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i64 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = add i64 %[[V2]], %[[V3]] +; CHECK-NEXT: add i64 %[[V4]], %[[V5]] +define void @add_i64(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, ptr %B, i64 %indvars.iv + %0 = load i64, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i64, ptr %C, i64 %indvars.iv + %1 = load i64, ptr %arrayidx.2, align 1 + %2 = add i64 %1, %0 + %arrayidx.3 = getelementptr inbounds i64, ptr %D, i64 %indvars.iv + %3 = load i64, ptr %arrayidx.3, align 1 + %4 = add i64 %2, %3 + %arrayidx.4 = getelementptr inbounds i64, ptr %E, i64 %indvars.iv + %5 = load i64, ptr %arrayidx.4, align 1 + %6 = add i64 %4, %5 + %arrayidx.5 = getelementptr inbounds i64, ptr %F, i64 %indvars.iv + %7 = load i64, ptr %arrayidx.5, align 1 + %8 = add i64 %6, %7 + %arrayidx.6 = getelementptr inbounds i64, ptr %G, i64 %indvars.iv + %9 = load i64, ptr %arrayidx.6, align 1 + %10 = add i64 %8, %9 + %arrayidx.7 = getelementptr inbounds i64, ptr %H, i64 %indvars.iv + %11 = load i64, ptr %arrayidx.7, align 1 + %12 = add i64 %10, %11 + %arrayidx.8 = getelementptr inbounds i64, ptr %I, i64 %indvars.iv + %13 = load i64, ptr %arrayidx.8, align 1 + %14 = add i64 %12, %13 + %arrayidx.9 = getelementptr inbounds i64, ptr %A, i64 %indvars.iv + store i64 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/integer-add-with-constant.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/integer-add-with-constant.ll @@ -0,0 +1,217 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-int-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_with_constant_i8( +; CHECK: %[[V0:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i8 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i8 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i8 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i8 %[[V3]], %[[V4]] +; CHECK-NEXT: add i8 %[[V5]], %[[V6]] +define void @add_with_constant_i8(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, ptr %B, i64 %indvars.iv + %0 = load i8, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i8, ptr %C, i64 %indvars.iv + %1 = load i8, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i8, ptr %D, i64 %indvars.iv + %2 = load i8, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i8, ptr %E, i64 %indvars.iv + %3 = load i8, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i8, ptr %F, i64 %indvars.iv + %4 = load i8, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i8, ptr %G, i64 %indvars.iv + %5 = load i8, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i8, ptr %H, i64 %indvars.iv + %6 = load i8, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i8, ptr %I, i64 %indvars.iv + %7 = load i8, ptr %arrayidx.8, align 4 + %8 = add i8 %0, 10 + %9 = add i8 %8, %1 + %10 = add i8 %9, %2 + %11 = add i8 %10, %3 + %12 = add i8 %11, %4 + %13 = add i8 %12, %5 + %14 = add i8 %13, %6 + %15 = add i8 %14, %7 + %arrayidx.9 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv + store i8 %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_with_constant_i16( +; CHECK: %[[V0:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i16 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i16 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i16 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i16 %[[V3]], %[[V4]] +; CHECK-NEXT: add i16 %[[V5]], %[[V6]] +define void @add_with_constant_i16(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, ptr %B, i64 %indvars.iv + %0 = load i16, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i16, ptr %C, i64 %indvars.iv + %1 = load i16, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i16, ptr %D, i64 %indvars.iv + %2 = load i16, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i16, ptr %E, i64 %indvars.iv + %3 = load i16, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i16, ptr %F, i64 %indvars.iv + %4 = load i16, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i16, ptr %G, i64 %indvars.iv + %5 = load i16, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i16, ptr %H, i64 %indvars.iv + %6 = load i16, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i16, ptr %I, i64 %indvars.iv + %7 = load i16, ptr %arrayidx.8, align 4 + %8 = add i16 %0, 10 + %9 = add i16 %8, %1 + %10 = add i16 %9, %2 + %11 = add i16 %10, %3 + %12 = add i16 %11, %4 + %13 = add i16 %12, %5 + %14 = add i16 %13, %6 + %15 = add i16 %14, %7 + %arrayidx.9 = getelementptr inbounds i16, ptr %A, i64 %indvars.iv + store i16 %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_with_constant_i32( +; CHECK: %[[V0:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i32 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i32 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i32 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i32 %[[V3]], %[[V4]] +; CHECK-NEXT: add i32 %[[V5]], %[[V6]] +define void @add_with_constant_i32(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + %2 = load i32, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %3 = load i32, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %4 = load i32, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i32, ptr %G, i64 %indvars.iv + %5 = load i32, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i32, ptr %H, i64 %indvars.iv + %6 = load i32, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i32, ptr %I, i64 %indvars.iv + %7 = load i32, ptr %arrayidx.8, align 4 + %8 = add i32 %0, 10 + %9 = add i32 %8, %1 + %10 = add i32 %9, %2 + %11 = add i32 %10, %3 + %12 = add i32 %11, %4 + %13 = add i32 %12, %5 + %14 = add i32 %13, %6 + %15 = add i32 %14, %7 + %arrayidx.9 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @add_with_constant_i64( +; CHECK: %[[V0:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add i64 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = add i64 {{.*}}, 10 +; CHECK-NEXT: %[[V3:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = add i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = add i64 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = add i64 %[[V3]], %[[V4]] +; CHECK-NEXT: add i64 %[[V5]], %[[V6]] +define void @add_with_constant_i64(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, ptr %B, i64 %indvars.iv + %0 = load i64, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i64, ptr %C, i64 %indvars.iv + %1 = load i64, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds i64, ptr %D, i64 %indvars.iv + %2 = load i64, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds i64, ptr %E, i64 %indvars.iv + %3 = load i64, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds i64, ptr %F, i64 %indvars.iv + %4 = load i64, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds i64, ptr %G, i64 %indvars.iv + %5 = load i64, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds i64, ptr %H, i64 %indvars.iv + %6 = load i64, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds i64, ptr %I, i64 %indvars.iv + %7 = load i64, ptr %arrayidx.8, align 4 + %8 = add i64 %0, 10 + %9 = add i64 %8, %1 + %10 = add i64 %9, %2 + %11 = add i64 %10, %3 + %12 = add i64 %11, %4 + %13 = add i64 %12, %5 + %14 = add i64 %13, %6 + %15 = add i64 %14, %7 + %arrayidx.9 = getelementptr inbounds i64, ptr %A, i64 %indvars.iv + store i64 %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/integer-mult-only.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/integer-mult-only.ll @@ -0,0 +1,209 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-int-thr < %s | FileCheck %s + +; CHECK-LABEL: @mul_i8( +; CHECK: %[[V0:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i8 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i8 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i8 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i8 %[[V4]], %[[V5]] +define void @mul_i8(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, ptr %B, i64 %indvars.iv + %0 = load i8, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i8, ptr %C, i64 %indvars.iv + %1 = load i8, ptr %arrayidx.2, align 1 + %2 = mul i8 %1, %0 + %arrayidx.3 = getelementptr inbounds i8, ptr %D, i64 %indvars.iv + %3 = load i8, ptr %arrayidx.3, align 1 + %4 = mul i8 %2, %3 + %arrayidx.4 = getelementptr inbounds i8, ptr %E, i64 %indvars.iv + %5 = load i8, ptr %arrayidx.4, align 1 + %6 = mul i8 %4, %5 + %arrayidx.5 = getelementptr inbounds i8, ptr %F, i64 %indvars.iv + %7 = load i8, ptr %arrayidx.5, align 1 + %8 = mul i8 %6, %7 + %arrayidx.6 = getelementptr inbounds i8, ptr %G, i64 %indvars.iv + %9 = load i8, ptr %arrayidx.6, align 1 + %10 = mul i8 %8, %9 + %arrayidx.7 = getelementptr inbounds i8, ptr %H, i64 %indvars.iv + %11 = load i8, ptr %arrayidx.7, align 1 + %12 = mul i8 %10, %11 + %arrayidx.8 = getelementptr inbounds i8, ptr %I, i64 %indvars.iv + %13 = load i8, ptr %arrayidx.8, align 1 + %14 = mul i8 %12, %13 + %arrayidx.9 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv + store i8 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @mul_i16( +; CHECK: %[[V0:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i16 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i16 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i16 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i16 %[[V4]], %[[V5]] +define void @mul_i16(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, ptr %B, i64 %indvars.iv + %0 = load i16, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i16, ptr %C, i64 %indvars.iv + %1 = load i16, ptr %arrayidx.2, align 1 + %2 = mul i16 %1, %0 + %arrayidx.3 = getelementptr inbounds i16, ptr %D, i64 %indvars.iv + %3 = load i16, ptr %arrayidx.3, align 1 + %4 = mul i16 %2, %3 + %arrayidx.4 = getelementptr inbounds i16, ptr %E, i64 %indvars.iv + %5 = load i16, ptr %arrayidx.4, align 1 + %6 = mul i16 %4, %5 + %arrayidx.5 = getelementptr inbounds i16, ptr %F, i64 %indvars.iv + %7 = load i16, ptr %arrayidx.5, align 1 + %8 = mul i16 %6, %7 + %arrayidx.6 = getelementptr inbounds i16, ptr %G, i64 %indvars.iv + %9 = load i16, ptr %arrayidx.6, align 1 + %10 = mul i16 %8, %9 + %arrayidx.7 = getelementptr inbounds i16, ptr %H, i64 %indvars.iv + %11 = load i16, ptr %arrayidx.7, align 1 + %12 = mul i16 %10, %11 + %arrayidx.8 = getelementptr inbounds i16, ptr %I, i64 %indvars.iv + %13 = load i16, ptr %arrayidx.8, align 1 + %14 = mul i16 %12, %13 + %arrayidx.9 = getelementptr inbounds i16, ptr %A, i64 %indvars.iv + store i16 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @mul_i32( +; CHECK: %[[V0:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i32 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i32 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i32 %[[V4]], %[[V5]] +define void @mul_i32(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx.2, align 1 + %2 = mul i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + %3 = load i32, ptr %arrayidx.3, align 1 + %4 = mul i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %5 = load i32, ptr %arrayidx.4, align 1 + %6 = mul i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %7 = load i32, ptr %arrayidx.5, align 1 + %8 = mul i32 %6, %7 + %arrayidx.6 = getelementptr inbounds i32, ptr %G, i64 %indvars.iv + %9 = load i32, ptr %arrayidx.6, align 1 + %10 = mul i32 %8, %9 + %arrayidx.7 = getelementptr inbounds i32, ptr %H, i64 %indvars.iv + %11 = load i32, ptr %arrayidx.7, align 1 + %12 = mul i32 %10, %11 + %arrayidx.8 = getelementptr inbounds i32, ptr %I, i64 %indvars.iv + %13 = load i32, ptr %arrayidx.8, align 1 + %14 = mul i32 %12, %13 + %arrayidx.9 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @mul_i64( +; CHECK: %[[V0:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = mul i64 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = mul i64 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = mul i64 %[[V2]], %[[V3]] +; CHECK-NEXT: mul i64 %[[V4]], %[[V5]] +define void @mul_i64(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, ptr %B, i64 %indvars.iv + %0 = load i64, ptr %arrayidx.1, align 1 + %arrayidx.2 = getelementptr inbounds i64, ptr %C, i64 %indvars.iv + %1 = load i64, ptr %arrayidx.2, align 1 + %2 = mul i64 %1, %0 + %arrayidx.3 = getelementptr inbounds i64, ptr %D, i64 %indvars.iv + %3 = load i64, ptr %arrayidx.3, align 1 + %4 = mul i64 %2, %3 + %arrayidx.4 = getelementptr inbounds i64, ptr %E, i64 %indvars.iv + %5 = load i64, ptr %arrayidx.4, align 1 + %6 = mul i64 %4, %5 + %arrayidx.5 = getelementptr inbounds i64, ptr %F, i64 %indvars.iv + %7 = load i64, ptr %arrayidx.5, align 1 + %8 = mul i64 %6, %7 + %arrayidx.6 = getelementptr inbounds i64, ptr %G, i64 %indvars.iv + %9 = load i64, ptr %arrayidx.6, align 1 + %10 = mul i64 %8, %9 + %arrayidx.7 = getelementptr inbounds i64, ptr %H, i64 %indvars.iv + %11 = load i64, ptr %arrayidx.7, align 1 + %12 = mul i64 %10, %11 + %arrayidx.8 = getelementptr inbounds i64, ptr %I, i64 %indvars.iv + %13 = load i64, ptr %arrayidx.8, align 1 + %14 = mul i64 %12, %13 + %arrayidx.9 = getelementptr inbounds i64, ptr %A, i64 %indvars.iv + store i64 %14, ptr %arrayidx.9, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/integer-sub-only.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/integer-sub-only.ll @@ -0,0 +1,209 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-int-thr < %s | FileCheck %s + +; CHECK-LABEL: @sub_i8( +; CHECK: %[[V0:.*]] = sub i8 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i8 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i8 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i8 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i8 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i8 %[[V4]], {{.*}} +; CHECK: sub i8 %[[V5]], {{.*}} +define void @sub_i8(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i8, ptr %B, i64 %indvars.iv + %0 = load i8, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i8, ptr %C, i64 %indvars.iv + %1 = load i8, ptr %arrayidx.2, align 4 + %2 = sub i8 %0, %1 + %arrayidx.3 = getelementptr inbounds i8, ptr %D, i64 %indvars.iv + %3 = load i8, ptr %arrayidx.3, align 4 + %4 = sub i8 %2, %3 + %arrayidx.4 = getelementptr inbounds i8, ptr %E, i64 %indvars.iv + %5 = load i8, ptr %arrayidx.4, align 4 + %6 = sub i8 %4, %5 + %arrayidx.5 = getelementptr inbounds i8, ptr %F, i64 %indvars.iv + %7 = load i8, ptr %arrayidx.5, align 4 + %8 = sub i8 %6, %7 + %arrayidx.6 = getelementptr inbounds i8, ptr %G, i64 %indvars.iv + %9 = load i8, ptr %arrayidx.6, align 4 + %10 = sub i8 %8, %9 + %arrayidx.7 = getelementptr inbounds i8, ptr %H, i64 %indvars.iv + %11 = load i8, ptr %arrayidx.7, align 4 + %12 = sub i8 %10, %11 + %arrayidx.8 = getelementptr inbounds i8, ptr %I, i64 %indvars.iv + %13 = load i8, ptr %arrayidx.8, align 4 + %14 = sub i8 %12, %13 + %arrayidx.9 = getelementptr inbounds i8, ptr %A, i64 %indvars.iv + store i8 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_i16( +; CHECK: %[[V0:.*]] = sub i16 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i16 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i16 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i16 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i16 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i16 %[[V4]], {{.*}} +; CHECK: sub i16 %[[V5]], {{.*}} +define void @sub_i16(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i16, ptr %B, i64 %indvars.iv + %0 = load i16, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i16, ptr %C, i64 %indvars.iv + %1 = load i16, ptr %arrayidx.2, align 4 + %2 = sub i16 %0, %1 + %arrayidx.3 = getelementptr inbounds i16, ptr %D, i64 %indvars.iv + %3 = load i16, ptr %arrayidx.3, align 4 + %4 = sub i16 %2, %3 + %arrayidx.4 = getelementptr inbounds i16, ptr %E, i64 %indvars.iv + %5 = load i16, ptr %arrayidx.4, align 4 + %6 = sub i16 %4, %5 + %arrayidx.5 = getelementptr inbounds i16, ptr %F, i64 %indvars.iv + %7 = load i16, ptr %arrayidx.5, align 4 + %8 = sub i16 %6, %7 + %arrayidx.6 = getelementptr inbounds i16, ptr %G, i64 %indvars.iv + %9 = load i16, ptr %arrayidx.6, align 4 + %10 = sub i16 %8, %9 + %arrayidx.7 = getelementptr inbounds i16, ptr %H, i64 %indvars.iv + %11 = load i16, ptr %arrayidx.7, align 4 + %12 = sub i16 %10, %11 + %arrayidx.8 = getelementptr inbounds i16, ptr %I, i64 %indvars.iv + %13 = load i16, ptr %arrayidx.8, align 4 + %14 = sub i16 %12, %13 + %arrayidx.9 = getelementptr inbounds i16, ptr %A, i64 %indvars.iv + store i16 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_i32( +; CHECK: %[[V0:.*]] = sub i32 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i32 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i32 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i32 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i32 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i32 %[[V4]], {{.*}} +; CHECK: sub i32 %[[V5]], {{.*}} +define void @sub_i32(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx.2, align 4 + %2 = sub i32 %0, %1 + %arrayidx.3 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + %3 = load i32, ptr %arrayidx.3, align 4 + %4 = sub i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %5 = load i32, ptr %arrayidx.4, align 4 + %6 = sub i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %7 = load i32, ptr %arrayidx.5, align 4 + %8 = sub i32 %6, %7 + %arrayidx.6 = getelementptr inbounds i32, ptr %G, i64 %indvars.iv + %9 = load i32, ptr %arrayidx.6, align 4 + %10 = sub i32 %8, %9 + %arrayidx.7 = getelementptr inbounds i32, ptr %H, i64 %indvars.iv + %11 = load i32, ptr %arrayidx.7, align 4 + %12 = sub i32 %10, %11 + %arrayidx.8 = getelementptr inbounds i32, ptr %I, i64 %indvars.iv + %13 = load i32, ptr %arrayidx.8, align 4 + %14 = sub i32 %12, %13 + %arrayidx.9 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_i64( +; CHECK: %[[V0:.*]] = sub i64 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = sub i64 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = sub i64 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = sub i64 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = sub i64 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = sub i64 %[[V4]], {{.*}} +; CHECK: sub i64 %[[V5]], {{.*}} +define void @sub_i64(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i64, ptr %B, i64 %indvars.iv + %0 = load i64, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i64, ptr %C, i64 %indvars.iv + %1 = load i64, ptr %arrayidx.2, align 4 + %2 = sub i64 %0, %1 + %arrayidx.3 = getelementptr inbounds i64, ptr %D, i64 %indvars.iv + %3 = load i64, ptr %arrayidx.3, align 4 + %4 = sub i64 %2, %3 + %arrayidx.4 = getelementptr inbounds i64, ptr %E, i64 %indvars.iv + %5 = load i64, ptr %arrayidx.4, align 4 + %6 = sub i64 %4, %5 + %arrayidx.5 = getelementptr inbounds i64, ptr %F, i64 %indvars.iv + %7 = load i64, ptr %arrayidx.5, align 4 + %8 = sub i64 %6, %7 + %arrayidx.6 = getelementptr inbounds i64, ptr %G, i64 %indvars.iv + %9 = load i64, ptr %arrayidx.6, align 4 + %10 = sub i64 %8, %9 + %arrayidx.7 = getelementptr inbounds i64, ptr %H, i64 %indvars.iv + %11 = load i64, ptr %arrayidx.7, align 4 + %12 = sub i64 %10, %11 + %arrayidx.8 = getelementptr inbounds i64, ptr %I, i64 %indvars.iv + %13 = load i64, ptr %arrayidx.8, align 4 + %14 = sub i64 %12, %13 + %arrayidx.9 = getelementptr inbounds i64, ptr %A, i64 %indvars.iv + store i64 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/leaf-num-check.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/leaf-num-check.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-int-thr < %s | FileCheck %s + +; CHECK-LABEL: @leaf_num_is_3( +; CHECK: %[[V0:.*]] = add nsw i32 {{.*}}, {{.*}} +; CHECK: add nsw i32 %[[V0]], {{.*}} +define void @leaf_num_is_3(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx.2, align 4 + %2 = add nsw i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + %3 = load i32, ptr %arrayidx.3, align 4 + %4 = add nsw i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %4, ptr %arrayidx.1, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @leaf_num_is_4( +; CHECK: %[[V0:.*]] = add nsw i32 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = add nsw i32 {{.*}}, {{.*}} +; CHECK-NEXT: add nsw i32 %[[V0]], %[[V1]] +define void @leaf_num_is_4(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %0 = load i32, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %1 = load i32, ptr %arrayidx.2, align 4 + %2 = add nsw i32 %1, %0 + %arrayidx.3 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + %3 = load i32, ptr %arrayidx.3, align 4 + %4 = add nsw i32 %2, %3 + %arrayidx.4 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %5 = load i32, ptr %arrayidx.4, align 4 + %6 = add nsw i32 %4, %5 + %arrayidx.5 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %6, ptr %arrayidx.5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/long-double-add-with-constant.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/long-double-add-with-constant.ll @@ -0,0 +1,55 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_fp128_with_constant( +; CHECK: %[[V0:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz fp128 {{.*}}, %[[V0]] +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz fp128 {{.*}}, 0xL00000000000000004002400000000000 +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz fp128 %[[V1]], %[[V2]] +; CHECK-NEXT: %[[V6:.*]] = fadd reassoc nsz fp128 %[[V3]], %[[V4]] +; CHECK-NEXT: fadd reassoc nsz fp128 %[[V5]], %[[V6]] +define void @add_fp128_with_constant(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, ptr %B, i64 %indvars.iv + %0 = load fp128, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, ptr %C, i64 %indvars.iv + %1 = load fp128, ptr %arrayidx.2, align 4 + %arrayidx.3 = getelementptr inbounds fp128, ptr %D, i64 %indvars.iv + %2 = load fp128, ptr %arrayidx.3, align 4 + %arrayidx.4 = getelementptr inbounds fp128, ptr %E, i64 %indvars.iv + %3 = load fp128, ptr %arrayidx.4, align 4 + %arrayidx.5 = getelementptr inbounds fp128, ptr %F, i64 %indvars.iv + %4 = load fp128, ptr %arrayidx.5, align 4 + %arrayidx.6 = getelementptr inbounds fp128, ptr %G, i64 %indvars.iv + %5 = load fp128, ptr %arrayidx.6, align 4 + %arrayidx.7 = getelementptr inbounds fp128, ptr %H, i64 %indvars.iv + %6 = load fp128, ptr %arrayidx.7, align 4 + %arrayidx.8 = getelementptr inbounds fp128, ptr %I, i64 %indvars.iv + %7 = load fp128, ptr %arrayidx.8, align 4 + %8 = fadd reassoc nsz fp128 %0, 0xL00000000000000004002400000000000 + %9 = fadd reassoc nsz fp128 %8, %1 + %10 = fadd reassoc nsz fp128 %9, %2 + %11 = fadd reassoc nsz fp128 %10, %3 + %12 = fadd reassoc nsz fp128 %11, %4 + %13 = fadd reassoc nsz fp128 %12, %5 + %14 = fadd reassoc nsz fp128 %13, %6 + %15 = fadd reassoc nsz fp128 %14, %7 + %arrayidx.9 = getelementptr inbounds fp128, ptr %A, i64 %indvars.iv + store fp128 %15, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/long-double-add.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/long-double-add.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_fp128( +; CHECK: %[[V0:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fadd reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fadd reassoc nsz fp128 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fadd reassoc nsz fp128 %[[V2]], %[[V3]] +; CHECK-NEXT: fadd reassoc nsz fp128 %[[V4]], %[[V5]] +define void @add_fp128(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, ptr %B, i64 %indvars.iv + %0 = load fp128, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, ptr %C, i64 %indvars.iv + %1 = load fp128, ptr %arrayidx.2, align 4 + %2 = fadd reassoc nsz fp128 %1, %0 + %arrayidx.3 = getelementptr inbounds fp128, ptr %D, i64 %indvars.iv + %3 = load fp128, ptr %arrayidx.3, align 4 + %4 = fadd reassoc nsz fp128 %2, %3 + %arrayidx.4 = getelementptr inbounds fp128, ptr %E, i64 %indvars.iv + %5 = load fp128, ptr %arrayidx.4, align 4 + %6 = fadd reassoc nsz fp128 %4, %5 + %arrayidx.5 = getelementptr inbounds fp128, ptr %F, i64 %indvars.iv + %7 = load fp128, ptr %arrayidx.5, align 4 + %8 = fadd reassoc nsz fp128 %6, %7 + %arrayidx.6 = getelementptr inbounds fp128, ptr %G, i64 %indvars.iv + %9 = load fp128, ptr %arrayidx.6, align 4 + %10 = fadd reassoc nsz fp128 %8, %9 + %arrayidx.7 = getelementptr inbounds fp128, ptr %H, i64 %indvars.iv + %11 = load fp128, ptr %arrayidx.7, align 4 + %12 = fadd reassoc nsz fp128 %10, %11 + %arrayidx.8 = getelementptr inbounds fp128, ptr %I, i64 %indvars.iv + %13 = load fp128, ptr %arrayidx.8, align 4 + %14 = fadd reassoc nsz fp128 %12, %13 + %arrayidx.9 = getelementptr inbounds fp128, ptr %A, i64 %indvars.iv + store fp128 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/long-double-mult.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/long-double-mult.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @add_fp128( +; CHECK: %[[V0:.*]] = fmul reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V1:.*]] = fmul reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V2:.*]] = fmul reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V3:.*]] = fmul reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK-NEXT: %[[V4:.*]] = fmul reassoc nsz fp128 %[[V0]], %[[V1]] +; CHECK-NEXT: %[[V5:.*]] = fmul reassoc nsz fp128 %[[V2]], %[[V3]] +; CHECK-NEXT: fmul reassoc nsz fp128 %[[V4]], %[[V5]] +define void @add_fp128(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, ptr %B, i64 %indvars.iv + %0 = load fp128, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, ptr %C, i64 %indvars.iv + %1 = load fp128, ptr %arrayidx.2, align 4 + %2 = fmul reassoc nsz fp128 %1, %0 + %arrayidx.3 = getelementptr inbounds fp128, ptr %D, i64 %indvars.iv + %3 = load fp128, ptr %arrayidx.3, align 4 + %4 = fmul reassoc nsz fp128 %2, %3 + %arrayidx.4 = getelementptr inbounds fp128, ptr %E, i64 %indvars.iv + %5 = load fp128, ptr %arrayidx.4, align 4 + %6 = fmul reassoc nsz fp128 %4, %5 + %arrayidx.5 = getelementptr inbounds fp128, ptr %F, i64 %indvars.iv + %7 = load fp128, ptr %arrayidx.5, align 4 + %8 = fmul reassoc nsz fp128 %6, %7 + %arrayidx.6 = getelementptr inbounds fp128, ptr %G, i64 %indvars.iv + %9 = load fp128, ptr %arrayidx.6, align 4 + %10 = fmul reassoc nsz fp128 %8, %9 + %arrayidx.7 = getelementptr inbounds fp128, ptr %H, i64 %indvars.iv + %11 = load fp128, ptr %arrayidx.7, align 4 + %12 = fmul reassoc nsz fp128 %10, %11 + %arrayidx.8 = getelementptr inbounds fp128, ptr %I, i64 %indvars.iv + %13 = load fp128, ptr %arrayidx.8, align 4 + %14 = fmul reassoc nsz fp128 %12, %13 + %arrayidx.9 = getelementptr inbounds fp128, ptr %A, i64 %indvars.iv + store fp128 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} Index: llvm/test/Transforms/TreeHeightReduction/long-double-sub.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/TreeHeightReduction/long-double-sub.ll @@ -0,0 +1,105 @@ +; RUN: opt -S -passes=tree-height-reduction -enable-fp-thr < %s | FileCheck %s + +; CHECK-LABEL: @sub_fp128( +; CHECK: %[[V0:.*]] = fsub reassoc nsz fp128 {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub reassoc nsz fp128 %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub reassoc nsz fp128 %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub reassoc nsz fp128 %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub reassoc nsz fp128 %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub reassoc nsz fp128 %[[V4]], {{.*}} +; CHECK: fsub reassoc nsz fp128 %[[V5]], {{.*}} +define void @sub_fp128(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds fp128, ptr %B, i64 %indvars.iv + %0 = load fp128, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds fp128, ptr %C, i64 %indvars.iv + %1 = load fp128, ptr %arrayidx.2, align 4 + %2 = fsub reassoc nsz fp128 %0, %1 + %arrayidx.3 = getelementptr inbounds fp128, ptr %D, i64 %indvars.iv + %3 = load fp128, ptr %arrayidx.3, align 4 + %4 = fsub reassoc nsz fp128 %2, %3 + %arrayidx.4 = getelementptr inbounds fp128, ptr %E, i64 %indvars.iv + %5 = load fp128, ptr %arrayidx.4, align 4 + %6 = fsub reassoc nsz fp128 %4, %5 + %arrayidx.5 = getelementptr inbounds fp128, ptr %F, i64 %indvars.iv + %7 = load fp128, ptr %arrayidx.5, align 4 + %8 = fsub reassoc nsz fp128 %6, %7 + %arrayidx.6 = getelementptr inbounds fp128, ptr %G, i64 %indvars.iv + %9 = load fp128, ptr %arrayidx.6, align 4 + %10 = fsub reassoc nsz fp128 %8, %9 + %arrayidx.7 = getelementptr inbounds fp128, ptr %H, i64 %indvars.iv + %11 = load fp128, ptr %arrayidx.7, align 4 + %12 = fsub reassoc nsz fp128 %10, %11 + %arrayidx.8 = getelementptr inbounds fp128, ptr %I, i64 %indvars.iv + %13 = load fp128, ptr %arrayidx.8, align 4 + %14 = fsub reassoc nsz fp128 %12, %13 + %arrayidx.9 = getelementptr inbounds fp128, ptr %A, i64 %indvars.iv + store fp128 %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; CHECK-LABEL: @sub_double( +; CHECK: %[[V0:.*]] = fsub reassoc nsz double {{.*}}, {{.*}} +; CHECK: %[[V1:.*]] = fsub reassoc nsz double %[[V0]], %{{.*}} +; CHECK: %[[V2:.*]] = fsub reassoc nsz double %[[V1]], {{.*}} +; CHECK: %[[V3:.*]] = fsub reassoc nsz double %[[V2]], {{.*}} +; CHECK: %[[V4:.*]] = fsub reassoc nsz double %[[V3]], {{.*}} +; CHECK: %[[V5:.*]] = fsub reassoc nsz double %[[V4]], {{.*}} +; CHECK: fsub reassoc nsz double %[[V5]], {{.*}} +define void @sub_double(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, i32 %N) norecurse nounwind { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %preh, label %for.end + +preh: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %preh + %indvars.iv = phi i64 [ 0, %preh ], [ %indvars.iv.next, %for.body ] + %arrayidx.1 = getelementptr inbounds double, ptr %B, i64 %indvars.iv + %0 = load double, ptr %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, ptr %C, i64 %indvars.iv + %1 = load double, ptr %arrayidx.2, align 4 + %2 = fsub reassoc nsz double %0, %1 + %arrayidx.3 = getelementptr inbounds double, ptr %D, i64 %indvars.iv + %3 = load double, ptr %arrayidx.3, align 4 + %4 = fsub reassoc nsz double %2, %3 + %arrayidx.4 = getelementptr inbounds double, ptr %E, i64 %indvars.iv + %5 = load double, ptr %arrayidx.4, align 4 + %6 = fsub reassoc nsz double %4, %5 + %arrayidx.5 = getelementptr inbounds double, ptr %F, i64 %indvars.iv + %7 = load double, ptr %arrayidx.5, align 4 + %8 = fsub reassoc nsz double %6, %7 + %arrayidx.6 = getelementptr inbounds double, ptr %G, i64 %indvars.iv + %9 = load double, ptr %arrayidx.6, align 4 + %10 = fsub reassoc nsz double %8, %9 + %arrayidx.7 = getelementptr inbounds double, ptr %H, i64 %indvars.iv + %11 = load double, ptr %arrayidx.7, align 4 + %12 = fsub reassoc nsz double %10, %11 + %arrayidx.8 = getelementptr inbounds double, ptr %I, i64 %indvars.iv + %13 = load double, ptr %arrayidx.8, align 4 + %14 = fsub reassoc nsz double %12, %13 + %arrayidx.9 = getelementptr inbounds double, ptr %A, i64 %indvars.iv + store double %14, ptr %arrayidx.9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +}