Index: docs/Passes.rst
===================================================================
--- docs/Passes.rst
+++ docs/Passes.rst
@@ -641,6 +641,20 @@
 :ref:`-functionattrs <passes-functionattrs>` pass and LLVM's knowledge of library calls on different targets.
 
+.. _passes-aggressive-instcombine:
+
+``-aggressive-instcombine``: Combine expression patterns
+---------------------------------------------------------
+
+Combine expression patterns to form expressions with fewer, simpler
+instructions. This pass does not modify the CFG.
+
+For example, this pass reduces the width of expressions post-dominated by a
+TruncInst into a smaller width when applicable.
+
+It differs from the instcombine pass in that it contains optimizations whose
+complexity is higher than O(1); thus, it should run less often than instcombine.
+
 ``-internalize``: Internalize Global Symbols
 --------------------------------------------
 
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -281,6 +281,7 @@
 void initializePartialInlinerLegacyPassPass(PassRegistry&);
 void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&);
 void initializePatchableFunctionPass(PassRegistry&);
+void initializeAggressiveInstCombinerPass(PassRegistry&);
 void initializePeepholeOptimizerPass(PassRegistry&);
 void initializePhysicalRegisterUsageInfoPass(PassRegistry&);
 void initializePlaceBackedgeSafepointsImplPass(PassRegistry&);
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -134,6 +134,13 @@
 //===----------------------------------------------------------------------===//
 //
+// AggressiveInstCombiner - Combine expression patterns to form expressions
+// with fewer, simpler instructions. This pass does not modify the CFG.
+//
+FunctionPass *createAggressiveInstCombinerPass();
+
+//===----------------------------------------------------------------------===//
+//
 // LICM - This pass is a loop invariant code motion and memory promotion pass.
 //
 Pass *createLICMPass();
Index: lib/LTO/LLVMBuild.txt
===================================================================
--- lib/LTO/LLVMBuild.txt
+++ lib/LTO/LLVMBuild.txt
@@ -27,6 +27,7 @@
 Core
 IPO
 InstCombine
+ AggressiveInstCombine
 Linker
 MC
 ObjCARC
Index: lib/Passes/LLVMBuild.txt
===================================================================
--- lib/Passes/LLVMBuild.txt
+++ lib/Passes/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = Passes
 parent = Libraries
-required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation
+required_libraries = Analysis CodeGen Core IPO InstCombine AggressiveInstCombine Scalar Support TransformUtils Vectorize Instrumentation
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -0,0 +1,85 @@
+//===- AggressiveInstCombine.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the aggressive expression pattern combiner classes.
+// Currently, it handles expression patterns for:
+//  * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+namespace {
+/// Contains expression pattern combiner logic.
+/// This class provides both the logic to recognize expression patterns and
+/// the logic to combine them. It differs from the InstCombiner class in that
+/// it runs only once, which allows pattern optimizations to have higher
+/// complexity than the O(1) required by the instruction combiner.
+class AggressiveInstCombiner : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  AggressiveInstCombiner() : FunctionPass(ID) {
+    initializeAggressiveInstCombinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Run all expression pattern optimizations on the given \p F function.
+  ///
+  /// \param F function to optimize.
+  /// \returns true if the IR is changed.
+  bool runOnFunction(Function &F) override;
+
+};
+} // namespace
+
+void AggressiveInstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
+  AU.addPreserved<AAResultsWrapperPass>();
+  AU.addPreserved<BasicAAWrapperPass>();
+  AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool AggressiveInstCombiner::runOnFunction(Function &F) {
+  auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  auto &DL = F.getParent()->getDataLayout();
+
+  bool MadeIRChange = false;
+
+  // Handle TruncInst patterns.
+  TruncInstCombine TIC(TLI, DL);
+  MadeIRChange |= TIC.run(F);
+
+  // TODO: add more patterns to handle...
+
+  return MadeIRChange;
+}
+
+char AggressiveInstCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AggressiveInstCombiner, "aggressive-instcombine",
+                      "Combine pattern based expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AggressiveInstCombiner, "aggressive-instcombine",
+                    "Combine pattern based expressions", false, false)
+
+FunctionPass *llvm::createAggressiveInstCombinerPass() {
+  return new AggressiveInstCombiner();
+}
Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -0,0 +1,119 @@
+//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the instruction pattern combiner classes.
+// Currently, it handles expression patterns for:
+//  * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TruncInstCombine - looks for expression dags dominated by trunc instructions
+// and for each eligible dag, it will create a reduced bit-width expression and
+// replace the old expression with this new one and remove the old one.
+// An eligible expression dag is such that:
+//   1. Contains only supported instructions.
+//   2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+//   3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
+//   4. All instructions in the dag must not have users outside the dag.
+//      The only exception is for {ZExt, SExt}Inst with operand type equal to
+//      the new reduced type chosen in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+  class DataLayout;
+  class TargetLibraryInfo;
+
+class TruncInstCombine {
+  TargetLibraryInfo &TLI;
+  const DataLayout &DL;
+
+  /// List of all TruncInst instructions to be processed.
+  SmallVector<TruncInst *, 4> Worklist;
+
+  /// Current processed TruncInst instruction.
+  TruncInst *CurrentTruncInst;
+
+  /// Information per each instruction in the expression dag.
+  struct Info {
+    /// Number of LSBs that are needed to generate a valid expression.
+    unsigned ValidBitWidth = 0;
+    /// Minimum number of LSBs needed to generate the ValidBitWidth.
+    unsigned MinBitWidth = 0;
+    /// The reduced value generated to replace the old instruction.
+    Value *NewValue = nullptr;
+  };
+  /// An ordered map representing the expression dag dominated by the currently
+  /// processed TruncInst. It maps each instruction in the dag to its Info
+  /// structure. The map is ordered such that each instruction appears before
+  /// all other instructions in the dag that use it.
+  MapVector<Instruction *, Info> InstInfoMap;
+
+public:
+  TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL)
+      : TLI(TLI), DL(DL), CurrentTruncInst(nullptr) {}
+
+  /// Perform TruncInst pattern optimization on given function.
+  bool run(Function &F);
+
+private:
+  /// Build the expression dag dominated by \p CurrentTruncInst and append it
+  /// to the InstInfoMap container.
+  ///
+  /// \return true only if we succeed to generate an eligible sub expression dag.
+  bool buildTruncExpressionDag();
+
+  /// Calculate the minimum number of LSBs needed to preserve the bit-width of
+  /// the \p CurrentTruncInst destination.
+  ///
+  /// \return minimum number of LSBs needed to preserve \p CurrentTruncInst
+  ///         destination BitWidth.
+  unsigned getMinBitWidth();
+
+  /// Build an expression dag dominated by the current processed TruncInst and
+  /// check if it is eligible to be reduced to a smaller type.
+  ///
+  /// \return the scalar version of the new type to be used for the reduced
+  ///         expression dag, or nullptr if the expression dag is not eligible
+  ///         to be reduced.
+  Type *getBestTruncatedType();
+
+  /// Given a \p V value and a \p SclTy scalar type, return the generated
+  /// reduced value of \p V based on the type \p SclTy.
+  ///
+  /// \param V value to be reduced.
+  /// \param SclTy scalar version of new type to reduce to.
+  /// \return the new reduced value.
+  Value *getReducedOperand(Value *V, Type *SclTy);
+
+  /// Create a new expression dag using the reduced \p SclTy type and replace
+  /// the old expression dag with it. Also erase all instructions in the old
+  /// dag, except those that are still needed outside the dag.
+  ///
+  /// \param SclTy scalar version of new type to reduce expression dag into.
+  void ReduceExpressionDag(Type *SclTy);
+};
+} // end namespace llvm.
Index: lib/Transforms/AggressiveInstCombine/CMakeLists.txt
===================================================================
--- lib/Transforms/AggressiveInstCombine/CMakeLists.txt
+++ lib/Transforms/AggressiveInstCombine/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_llvm_library(LLVMAggressiveInstCombine
+  AggressiveInstCombine.cpp
+  TruncInstCombine.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/AggressiveInstCombine
+
+  DEPENDS
+  intrinsics_gen
+  )
Index: lib/Transforms/AggressiveInstCombine/LLVMBuild.txt
===================================================================
--- lib/Transforms/AggressiveInstCombine/LLVMBuild.txt
+++ lib/Transforms/AggressiveInstCombine/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/AggressiveInstCombine/LLVMBuild.txt -----*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AggressiveInstCombine
+parent = Transforms
+required_libraries = Analysis Core Support TransformUtils
Index: lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -0,0 +1,404 @@
+//===- TruncInstCombine.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
+// for each eligible dag, it will create a reduced bit-width expression, replace
+// the old expression with this new one and remove the old expression.
+// An eligible expression dag is such that:
+//   1. Contains only supported instructions.
+//   2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+//   3. Can be evaluated into type with reduced legal bit-width.
+//   4. All instructions in the dag must not have users outside the dag.
+//      The only exception is for {ZExt, SExt}Inst with operand type equal to
+//      the new reduced type evaluated in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+/// \brief Inserts an instruction \p New before instruction \p Old and updates
+/// the \p New instruction's DebugLoc and Name with those of \p Old.
+static void InsertAndUpdateNewInstWith(Instruction &New, Instruction &Old) {
+  assert(!New.getParent() &&
+         "New instruction already inserted into a basic block!");
+  BasicBlock *BB = Old.getParent();
+  BB->getInstList().insert(Old.getIterator(), &New); // Insert inst
+  New.setDebugLoc(Old.getDebugLoc());
+  New.takeName(&Old);
+}
+
+/// Given an instruction and a container, fill in all the relevant operands of
+/// that instruction, with respect to the Trunc expression dag optimization.
+static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    Ops.push_back(I->getOperand(0));
+    Ops.push_back(I->getOperand(1));
+    break;
+  default:
+    llvm_unreachable("Unreachable!");
+  }
+}
+
+bool TruncInstCombine::buildTruncExpressionDag() {
+  SmallVector<Value *, 8> Worklist;
+  SmallVector<Instruction *, 8> Stack;
+  // Clear old expression dag.
+  InstInfoMap.clear();
+
+  Worklist.push_back(CurrentTruncInst->getOperand(0));
+
+  while (!Worklist.empty()) {
+    Value *Curr = Worklist.back();
+
+    if (isa<Constant>(Curr)) {
+      Worklist.pop_back();
+      continue;
+    }
+
+    Instruction *I = dyn_cast<Instruction>(Curr);
+    if (!I)
+      return false;
+
+    if (!Stack.empty() && Stack.back() == I) {
+      // Already handled all instruction operands; we can remove it from both
+      // the Worklist and the Stack, and add it to the instruction info map.
+      Worklist.pop_back();
+      Stack.pop_back();
+      // Insert I to the Info map.
+      InstInfoMap.insert(std::make_pair(I, Info()));
+      continue;
+    }
+
+    if (InstInfoMap.count(I)) {
+      Worklist.pop_back();
+      continue;
+    }
+
+    // Add the instruction to the stack before starting to handle its operands.
+    Stack.push_back(I);
+
+    unsigned Opc = I->getOpcode();
+    switch (Opc) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // trunc(trunc(x)) -> trunc(x)
+      // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+      // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
+      // dest
+      break;
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      SmallVector<Value *, 2> Operands;
+      getRelevantOperands(I, Operands);
+      for (Value *Operand : Operands)
+        Worklist.push_back(Operand);
+      break;
+    }
+    default:
+      // TODO: Can handle more cases here:
+      // 1. select, shufflevector, extractelement, insertelement
+      // 2. udiv, urem
+      // 3. shl, lshr, ashr
+      // 4. phi node (and loop handling)
+      // ...
+      return false;
+    }
+  }
+  return true;
+}
+
+unsigned TruncInstCombine::getMinBitWidth() {
+  SmallVector<Value *, 8> Worklist;
+  SmallVector<Instruction *, 8> Stack;
+
+  Value *Src = CurrentTruncInst->getOperand(0);
+  Type *DstTy = CurrentTruncInst->getType();
+  unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
+  unsigned OrigBitWidth =
+      CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+  if (isa<Constant>(Src))
+    return TruncBitWidth;
+
+  Worklist.push_back(Src);
+  InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
+
+  while (!Worklist.empty()) {
+    Value *Curr = Worklist.back();
+
+    if (isa<Constant>(Curr)) {
+      Worklist.pop_back();
+      continue;
+    }
+
+    // Otherwise, it must be an instruction.
+    Instruction *I = cast<Instruction>(Curr);
+
+    auto &Info = InstInfoMap[I];
+
+    SmallVector<Value *, 2> Operands;
+    getRelevantOperands(I, Operands);
+
+    if (!Stack.empty() && Stack.back() == I) {
+      // Already handled all instruction operands; we can remove it from both
+      // the Worklist and the Stack, and update MinBitWidth.
+      Worklist.pop_back();
+      Stack.pop_back();
+      for (auto *Operand : Operands)
+        if (auto *IOp = dyn_cast<Instruction>(Operand))
+          Info.MinBitWidth =
+              std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
+      continue;
+    }
+
+    // Add the instruction to the stack before starting to handle its operands.
+    Stack.push_back(I);
+    unsigned ValidBitWidth = Info.ValidBitWidth;
+
+    // Update minimum bit-width before handling its operands. This is required
+    // when the instruction is part of a loop.
+    Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
+
+    for (auto *Operand : Operands)
+      if (auto *IOp = dyn_cast<Instruction>(Operand)) {
+        // If we already calculated the minimum bit-width for this valid
+        // bit-width, or for a smaller valid bit-width, then just keep the
+        // answer we already calculated.
+        if (InstInfoMap[IOp].ValidBitWidth >= ValidBitWidth)
+          continue;
+        InstInfoMap[IOp].ValidBitWidth =
+            std::max(ValidBitWidth, InstInfoMap[IOp].ValidBitWidth);
+        Worklist.push_back(IOp);
+      }
+  }
+  unsigned MinBitWidth = InstInfoMap[cast<Instruction>(Src)].MinBitWidth;
+  assert(MinBitWidth >= TruncBitWidth);
+
+  if (MinBitWidth > TruncBitWidth) {
+    // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
+    Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
+    if (!Ty)
+      return OrigBitWidth;
+    // Update minimum bit-width with the new destination type bit-width.
+    MinBitWidth = Ty->getScalarSizeInBits();
+  } else { // MinBitWidth == TruncBitWidth
+    // In this case the expression can be evaluated with the trunc instruction
+    // destination type, and the trunc instruction can be omitted. However, we
+    // should not perform the evaluation if the original type is a legal scalar
+    // type and the target type is illegal.
+    bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
+    bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
+    if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
+      return OrigBitWidth;
+  }
+  return MinBitWidth;
+}
+
+Type *TruncInstCombine::getBestTruncatedType() {
+  if (!buildTruncExpressionDag())
+    return nullptr;
+
+  // We don't want to duplicate instructions, which isn't profitable. Thus, we
+  // can't shrink something that has multiple users, unless all users are
+  // post-dominated by the trunc instruction, i.e., were visited during the
+  // expression evaluation.
+  unsigned DesiredBitWidth = 0;
+  for (auto Itr : InstInfoMap) {
+    Instruction *I = Itr.first;
+    if (I->hasOneUse())
+      continue;
+    bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
+    for (auto *U : I->users())
+      if (auto *UI = dyn_cast<Instruction>(U))
+        if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
+          if (!IsExtInst)
+            return nullptr;
+          // If this is an extension from the dest type, we can eliminate it,
+          // even if it has multiple users. Thus, update the DesiredBitWidth and
+          // validate that all extension instructions agree on the same
+          // DesiredBitWidth.
+          unsigned ExtInstBitWidth =
+              I->getOperand(0)->getType()->getScalarSizeInBits();
+          if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
+            return nullptr;
+          DesiredBitWidth = ExtInstBitWidth;
+        }
+  }
+
+  unsigned OrigBitWidth =
+      CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+  // Calculate the minimum bit-width needed to compute the valid bit-width.
+  unsigned MinBitWidth = getMinBitWidth();
+
+  // Check that we can shrink to a smaller bit-width than the original one and
+  // that it is similar to the DesiredBitWidth, if such exists.
+  if (MinBitWidth >= OrigBitWidth ||
+      (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
+    return nullptr;
+
+  return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
+}
+
+/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
+/// for \p V, according to its type: if it has a vector type, return the
+/// vector version of \p Ty, otherwise return \p Ty.
+static Type *getReducedType(Value *V, Type *Ty) {
+  assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
+  if (auto *VTy = dyn_cast<VectorType>(V->getType()))
+    return VectorType::get(Ty, VTy->getNumElements());
+  return Ty;
+}
+
+Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
+  Type *Ty = getReducedType(V, SclTy);
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    C = ConstantExpr::getIntegerCast(C, Ty, false);
+    // If we got a constantexpr back, try to simplify it with DL info.
+    if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
+      C = FoldedC;
+    return C;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  Info Entry = InstInfoMap.lookup(I);
+  assert(Entry.NewValue);
+  return Entry.NewValue;
+}
+
+void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+  for (auto &Itr : InstInfoMap) { // Forward
+    Instruction *I = Itr.first;
+
+    assert(!InstInfoMap.lookup(I).NewValue && "Instruction has been evaluated");
+
+    Instruction *Res = nullptr;
+    unsigned Opc = I->getOpcode();
+    switch (Opc) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt: {
+      Type *Ty = getReducedType(I, SclTy);
+      // If the source type of the cast is the type we're trying for then we can
+      // just return the source. There's no need to insert it because it is not
+      // new.
+      if (I->getOperand(0)->getType() == Ty) {
+        InstInfoMap[I].NewValue = I->getOperand(0);
+        continue;
+      }
+      // Otherwise, must be the same type of cast, so just reinsert a new one.
+      // This also handles the case of zext(trunc(x)) -> zext(x).
+      Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
+                                        Opc == Instruction::SExt);
+
+      // Update Worklist entries with new value if needed.
+      if (auto *NewCI = dyn_cast<TruncInst>(Res)) {
+        auto Entry = find(Worklist, I);
+        if (Entry != Worklist.end())
+          *Entry = NewCI;
+      }
+      break;
+    }
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
+      Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
+      Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+      break;
+    }
+    default:
+      llvm_unreachable("Unhandled instruction");
+    }
+
+    InstInfoMap[I].NewValue = Res;
+    InsertAndUpdateNewInstWith(*Res, *I);
+  }
+
+  Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
+  Type *DstTy = CurrentTruncInst->getType();
+  Instruction *NewInst = nullptr;
+  if (Res->getType() != DstTy) {
+    NewInst = CastInst::CreateIntegerCast(Res, DstTy, false);
+    InsertAndUpdateNewInstWith(*NewInst, *CurrentTruncInst);
+    Res = NewInst;
+  }
+  CurrentTruncInst->replaceAllUsesWith(Res);
+
+  // Erase the old expression dag, which was replaced by the reduced expression
+  // dag. We iterate backward, which means we visit the instruction before we
+  // visit any of its operands; this way, when we get to the operand, we have
+  // already removed the instructions (from the expression dag) that use it.
+  CurrentTruncInst->eraseFromParent();
+  for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
+    // We still need to check that the instruction has no users before we erase
+    // it, because a {SExt, ZExt}Inst instruction might have other users that
+    // were not reduced; in such a case, we need to keep that instruction.
+    if (!I->first->getNumUses())
+      I->first->eraseFromParent();
+  }
+}
+
+bool TruncInstCombine::run(Function &F) {
+  bool MadeIRChange = false;
+
+  // Collect all TruncInst instructions in the function into the Worklist for
+  // evaluation.
+  for (auto &BB : F)
+    for (auto &I : BB)
+      if (TruncInst *CI = dyn_cast<TruncInst>(&I))
+        Worklist.push_back(CI);
+
+  // Process all TruncInst in the Worklist; for each instruction:
+  //   1. Check if it dominates an eligible expression dag to be reduced.
+  //   2. Create a reduced expression dag and replace the old one with it.
+ while (!Worklist.empty()) { + CurrentTruncInst = Worklist.pop_back_val(); + + if (Type *NewDstSclTy = getBestTruncatedType()) { + DEBUG(dbgs() << "ICE: TruncInstCombine reducing type of expression dag " + "dominated by: " + << CurrentTruncInst << '\n'); + ReduceExpressionDag(NewDstSclTy); + MadeIRChange = true; + } + } + + return MadeIRChange; +} Index: lib/Transforms/CMakeLists.txt =================================================================== --- lib/Transforms/CMakeLists.txt +++ lib/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory(Utils) add_subdirectory(Instrumentation) +add_subdirectory(AggressiveInstCombine) add_subdirectory(InstCombine) add_subdirectory(Scalar) add_subdirectory(IPO) Index: lib/Transforms/IPO/LLVMBuild.txt =================================================================== --- lib/Transforms/IPO/LLVMBuild.txt +++ lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ name = IPO parent = Transforms library_name = ipo -required_libraries = Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = Analysis BitReader BitWriter Core InstCombine AggressiveInstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -329,6 +329,7 @@ MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Combine silly seq's + MPM.add(createAggressiveInstCombinerPass()); addInstructionCombiningPass(MPM); if (SizeLevel == 0 && !DisableLibCallsShrinkWrap) MPM.add(createLibCallsShrinkWrapPass()); @@ -746,6 +747,7 @@ // simplification opportunities, and both can propagate functions through // function pointers. When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. 
+ PM.add(createAggressiveInstCombinerPass()); addInstructionCombiningPass(PM); addExtensionsToPM(EP_Peephole, PM); Index: lib/Transforms/LLVMBuild.txt =================================================================== --- lib/Transforms/LLVMBuild.txt +++ lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = Coroutines IPO InstCombine AggressiveInstCombine Instrumentation Scalar Utils Vectorize ObjCARC [component_0] type = Group Index: lib/Transforms/Scalar/LLVMBuild.txt =================================================================== --- lib/Transforms/Scalar/LLVMBuild.txt +++ lib/Transforms/Scalar/LLVMBuild.txt @@ -20,4 +20,4 @@ name = Scalar parent = Transforms library_name = ScalarOpts -required_libraries = Analysis Core InstCombine Support TransformUtils +required_libraries = Analysis Core InstCombine AggressiveInstCombine Support TransformUtils Index: test/Transforms/AggressiveInstCombine/trunc_multi_uses.ll =================================================================== --- test/Transforms/AggressiveInstCombine/trunc_multi_uses.ll +++ test/Transforms/AggressiveInstCombine/trunc_multi_uses.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -aggressive-instcombine -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; Aggressive Instcombine should be able to reduce width of these expressions. + +declare i32 @use32(i32) +declare i32 @use64(i64) +declare <2 x i32> @use32_vec(<2 x i32>) +declare <2 x i32> @use64_vec(<2 x i64>) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; These tests check cases where expression dag post-dominated by TruncInst +;; contains instruction other than {zext, sext, trunc} has more than one usage. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @multi_uses_add(i32 %X) { +; CHECK-LABEL: @multi_uses_add( +; CHECK-NEXT: [[A1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = add i32 [[X]], 15 +; CHECK-NEXT: [[C1:%.*]] = mul i32 [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call i32 @use32(i32 [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use64(i64 [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext i32 %X to i64 + %B1 = add i64 %A1, 15 + %C1 = mul i64 %B1, %B1 + %T1 = trunc i64 %C1 to i32 + call i32 @use32(i32 %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call i32 @use64(i64 %A1) + ret void +} + +define void @multi_uses_or(i32 %X) { +; CHECK-LABEL: @multi_uses_or( +; CHECK-NEXT: [[A1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = or i32 [[X]], 15 +; CHECK-NEXT: [[C1:%.*]] = mul i32 [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call i32 @use32(i32 [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use64(i64 [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext i32 %X to i64 + %B1 = or i64 %A1, 15 + %C1 = mul i64 %B1, %B1 + %T1 = trunc i64 %C1 to i32 + call i32 @use32(i32 %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. 
+ call i32 @use64(i64 %A1) + ret void +} + +define void @multi_uses_xor(i32 %X) { +; CHECK-LABEL: @multi_uses_xor( +; CHECK-NEXT: [[A1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = xor i32 [[X]], 15 +; CHECK-NEXT: [[C1:%.*]] = mul i32 [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call i32 @use32(i32 [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use64(i64 [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext i32 %X to i64 + %B1 = xor i64 %A1, 15 + %C1 = mul i64 %B1, %B1 + %T1 = trunc i64 %C1 to i32 + call i32 @use32(i32 %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call i32 @use64(i64 %A1) + ret void +} + +define void @multi_uses_and(i32 %X) { +; CHECK-LABEL: @multi_uses_and( +; CHECK-NEXT: [[A1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = and i32 [[X]], 15 +; CHECK-NEXT: [[C1:%.*]] = mul i32 [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call i32 @use32(i32 [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use64(i64 [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext i32 %X to i64 + %B1 = and i64 %A1, 15 + %C1 = mul i64 %B1, %B1 + %T1 = trunc i64 %C1 to i32 + call i32 @use32(i32 %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call i32 @use64(i64 %A1) + ret void +} + +define void @multi_uses_sub(i32 %X, i32 %Y) { +; CHECK-LABEL: @multi_uses_sub( +; CHECK-NEXT: [[A1:%.*]] = zext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[A2:%.*]] = zext i32 [[Y:%.*]] to i64 +; CHECK-NEXT: [[B1:%.*]] = sub i32 [[X]], [[Y]] +; CHECK-NEXT: [[C1:%.*]] = mul i32 [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call i32 @use32(i32 [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use64(i64 [[A1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use64(i64 [[A2]]) +; CHECK-NEXT: ret void +; + %A1 = zext i32 %X to i64 + %A2 = zext i32 %Y to i64 + %B1 = sub i64 %A1, %A2 + %C1 = mul i64 %B1, %B1 + %T1 = trunc i64 %C1 to i32 + call i32 @use32(i32 %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call i32 @use64(i64 %A1) + call i32 @use64(i64 %A2) + ret void +} + +define void @multi_use_vec_add(<2 x i32> %X) { +; CHECK-LABEL: @multi_use_vec_add( +; CHECK-NEXT: [[A1:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> +; CHECK-NEXT: [[B1:%.*]] = add <2 x i32> [[X]], +; CHECK-NEXT: [[C1:%.*]] = mul <2 x i32> [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @use32_vec(<2 x i32> [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @use64_vec(<2 x i64> [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext <2 x i32> %X to <2 x i64> + %B1 = add <2 x i64> %A1, + %C1 = mul <2 x i64> %B1, %B1 + %T1 = trunc <2 x i64> %C1 to <2 x i32> + call <2 x i32> @use32_vec(<2 x i32> %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call <2 x i32> @use64_vec(<2 x i64> %A1) + ret void +} + +define void @multi_use_vec_or(<2 x i32> %X) { +; CHECK-LABEL: @multi_use_vec_or( +; CHECK-NEXT: [[A1:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> +; CHECK-NEXT: [[B1:%.*]] = or <2 x i32> [[X]], +; CHECK-NEXT: [[C1:%.*]] = mul <2 x i32> [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @use32_vec(<2 x i32> [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @use64_vec(<2 x i64> [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext <2 x i32> %X to <2 x i64> + %B1 = or <2 x i64> %A1, + %C1 = mul <2 x i64> %B1, %B1 + %T1 = trunc <2 x i64> %C1 to <2 x i32> + call <2 x i32> @use32_vec(<2 x i32> %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. 
+ call <2 x i32> @use64_vec(<2 x i64> %A1) + ret void +} + +define void @multi_use_vec_xor(<2 x i32> %X) { +; CHECK-LABEL: @multi_use_vec_xor( +; CHECK-NEXT: [[A1:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> +; CHECK-NEXT: [[B1:%.*]] = xor <2 x i32> [[X]], +; CHECK-NEXT: [[C1:%.*]] = mul <2 x i32> [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @use32_vec(<2 x i32> [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @use64_vec(<2 x i64> [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext <2 x i32> %X to <2 x i64> + %B1 = xor <2 x i64> %A1, + %C1 = mul <2 x i64> %B1, %B1 + %T1 = trunc <2 x i64> %C1 to <2 x i32> + call <2 x i32> @use32_vec(<2 x i32> %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call <2 x i32> @use64_vec(<2 x i64> %A1) + ret void +} + +define void @multi_use_vec_and(<2 x i32> %X) { +; CHECK-LABEL: @multi_use_vec_and( +; CHECK-NEXT: [[A1:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> +; CHECK-NEXT: [[B1:%.*]] = and <2 x i32> [[X]], +; CHECK-NEXT: [[C1:%.*]] = mul <2 x i32> [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @use32_vec(<2 x i32> [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @use64_vec(<2 x i64> [[A1]]) +; CHECK-NEXT: ret void +; + %A1 = zext <2 x i32> %X to <2 x i64> + %B1 = and <2 x i64> %A1, + %C1 = mul <2 x i64> %B1, %B1 + %T1 = trunc <2 x i64> %C1 to <2 x i32> + call <2 x i32> @use32_vec(<2 x i32> %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call <2 x i32> @use64_vec(<2 x i64> %A1) + ret void +} + +define void @multi_use_vec_sub(<2 x i32> %X, <2 x i32> %Y) { +; CHECK-LABEL: @multi_use_vec_sub( +; CHECK-NEXT: [[A1:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> +; CHECK-NEXT: [[A2:%.*]] = zext <2 x i32> [[Y:%.*]] to <2 x i64> +; CHECK-NEXT: [[B1:%.*]] = sub <2 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[C1:%.*]] = mul <2 x i32> [[B1]], [[B1]] +; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @use32_vec(<2 x i32> [[C1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @use64_vec(<2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @use64_vec(<2 x i64> [[A2]]) +; CHECK-NEXT: ret void +; + %A1 = zext <2 x i32> %X to <2 x i64> + %A2 = zext <2 x i32> %Y to <2 x i64> + %B1 = sub <2 x i64> %A1, %A2 + %C1 = mul <2 x i64> %B1, %B1 + %T1 = trunc <2 x i64> %C1 to <2 x i32> + call <2 x i32> @use32_vec(<2 x i32> %T1) + ; make sure zext have another use that is not post-dominated by the TruncInst. + call <2 x i32> @use64_vec(<2 x i64> %A1) + call <2 x i32> @use64_vec(<2 x i64> %A2) + ret void +} Index: tools/opt/opt.cpp =================================================================== --- tools/opt/opt.cpp +++ tools/opt/opt.cpp @@ -387,6 +387,7 @@ initializeAnalysis(Registry); initializeTransformUtils(Registry); initializeInstCombine(Registry); + initializeAggressiveInstCombinerPass(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); // For codegen passes, only passes that do IR to IR transformation are
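For reference, a minimal before/after sketch of the reduction this patch performs, adapted from the multi_uses_add test above (value names are illustrative):

  ; before -aggressive-instcombine: the add/mul chain is only used through a trunc
  %A1 = zext i32 %X to i64
  %B1 = add i64 %A1, 15
  %C1 = mul i64 %B1, %B1
  %T1 = trunc i64 %C1 to i32
  call i32 @use32(i32 %T1)
  call i32 @use64(i64 %A1)

  ; after: the chain is evaluated directly in i32 and the trunc disappears;
  ; the zext survives only because it has another use outside the reduced dag
  %A1 = zext i32 %X to i64
  %B1 = add i32 %X, 15
  %C1 = mul i32 %B1, %B1
  call i32 @use32(i32 %C1)
  call i32 @use64(i64 %A1)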