Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -93,6 +93,7 @@ #include <cstdint> #include <iterator> #include <memory> +#include <queue> #include <set> #include <string> #include <tuple> @@ -125,6 +126,11 @@ cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt<unsigned> + SLPThrottleBudget("slp-throttling-budget", cl::init(32), cl::Hidden, + cl::desc("Limit the total number of nodes for cost " + "recalculations during throttling")); + static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); @@ -634,11 +640,61 @@ /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - InstructionCost getSpillCost() const; + InstructionCost getSpillCost(); + + /// \returns the cost extracting vectorized elements. + InstructionCost getExtractShuffleCost(InstructionCost Cost); + + /// \returns the cost of gathering canceled elements to be used + /// by vectorized operations during throttling. + InstructionCost getInsertCost(ArrayRef<Value *> VectorizedVals); + + /// Find a non-gathering leaf node from current node C and record the path + /// on the way. + void findLeaf(TreeEntry *C, SetVector<TreeEntry *> &Path) const; + + using SubTreeQueue = + std::priority_queue<std::pair<int, std::vector<TreeEntry *>>>; + + /// Find a subtree of the whole tree suitable to be vectorized. When + /// vectorizing the whole tree is not profitable, we can consider vectorizing + /// part of that tree. SLP algorithm looks to operations to vectorize starting + /// from seed instructions on the bottom toward the end of chains of + /// dependencies to the top of SLP graph, it groups potentially vectorizable + /// operations in scalar form to bundles. + /// For example: + /// + /// <bundle 1> vector form + /// | + /// <bundle 2> vector form <bundle 3> vector form + /// \ / + /// <seed root bundle> vector form + /// + /// Total cost is not profitable to vectorize, hence all operations are in + /// scalar form. + /// + /// Here is the same tree after SLP throttling transformation: + /// + /// <bundle 1> vector form + /// | + /// <bundle 2> vector form <bundle 3> gathered nodes + /// \ / + /// <seed root bundle> vector form + /// + /// So, we can throttle some operations in such a way that it is still + /// profitable to vectorize part on the tree, while all tree vectorization + /// does not make sense. + /// More details: + /// https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf + bool findSubTree(SubTreeQueue &SubTrees, InstructionCost TreeCost); + + /// Get raw summary of all elements of the tree. + InstructionCost getRawTreeCost(ArrayRef<Value *> VectorizedVals = None); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None); + InstructionCost getTreeCost(bool TreeReduce, + ArrayRef<Value *> VectorizedVals = None); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -659,6 +715,8 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + InternalTreeUses.clear(); + ProposedToGather.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { @@ -667,6 +725,10 @@ } MinBWs.clear(); InstrElementSize.clear(); + NoCallInst = true; + RawTreeCost = 0; + ReduceableCost = 0; + IsCostSumReady = false; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -829,6 +891,9 @@ /// may not be necessary. bool isLoadCombineCandidate() const; + /// Cut the tree to make it partially vectorizable. + void cutTree(); + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1647,6 +1712,9 @@ /// Does this entry require reordering? SmallVector<unsigned, 4> ReorderIndices; + /// Cost of this tree entry. + InstructionCost Cost = 0; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -1659,6 +1727,9 @@ /// have multiple users so the data structure is not truly a tree. SmallVector<EdgeInfo, 1> UserTreeIndices; + /// Use of this entry. + TinyPtrVector<TreeEntry *> UseEntries; + /// The index of this treeEntry in VectorizableTree. int Idx = -1; @@ -1780,6 +1851,13 @@ return FoundLane; } + // Find nodes with more than one use. + bool isFork() const { + return llvm::count_if(UseEntries, [this](TreeEntry *Next) { + return (Next->Idx != Idx && Next->State != TreeEntry::NeedToGather); + }) > 1; + } + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -1804,6 +1882,8 @@ dbgs() << "NeedToGather\n"; break; } + dbgs() << "Cost: "; + dbgs() << Cost << "\n"; dbgs() << "MainOp: "; if (MainOp) dbgs() << *MainOp << "\n"; @@ -1902,8 +1982,10 @@ MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) + if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); + VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last); + } return Last; } @@ -1953,6 +2035,13 @@ }; using UserList = SmallVector<ExternalUser, 16>; + /// \returns the cost of extracting the vectorized elements. + InstructionCost + getExtractOperationCost(const ExternalUser &EU, + SmallVectorImpl<SmallVector<int>> &ShuffleMask, + SmallVectorImpl<Value *> &FirstUsers, + SmallVectorImpl<APInt> &DemandedElts); + /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it @@ -2000,6 +2089,27 @@ /// after vectorization. UserList ExternalUses; + /// Tree entries that should not be vectorized due to throttling. + SmallPtrSet<TreeEntry *, 2> ProposedToGather; + + /// Raw cost of all elemts in the tree. + InstructionCost RawTreeCost = 0; + + InstructionCost ReduceableCost = 0; + + /// Indicate that no CallInst found in the tree and we don't need to + /// calculate spill cost. + bool NoCallInst = true; + + /// True, if we have calucalted tree cost for the tree. + bool IsCostSumReady = false; + + /// Current operations width to vectorize. + unsigned BundleWidth = 0; + + /// Internal tree oprations proposed to be vectorized values use. + SmallDenseMap<Value *, UserList> InternalTreeUses; + /// Values used only by @llvm.assume calls. SmallPtrSet<const Value *, 32> EphValues; @@ -2342,6 +2452,9 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// Make the scheduling region smaller. + void reduceSchedulingRegion(Instruction *Start, Instruction *End); + BasicBlock *BB; /// Simple memory allocation for ScheduleData. @@ -2404,6 +2517,9 @@ /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); + /// Remove operations from the list of proposed to schedule. + void removeFromScheduling(BlockScheduling *BS); + /// List of users to ignore during scheduling and that don't need extracting. ArrayRef<Value *> UserIgnoreList; @@ -2537,6 +2653,7 @@ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { std::string Str; raw_string_ostream OS(Str); + OS << "Idx: " << Entry->Idx << "\n"; if (isSplat(Entry->Scalars)) { OS << "<splat> " << *Entry->Scalars[0]; return Str; @@ -2607,7 +2724,7 @@ buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -2642,6 +2759,7 @@ // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. + InternalTreeUses[U].emplace_back(Scalar, U, FoundLane); if (UseScalar != U || UseEntry->State == TreeEntry::ScatterVectorize || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { @@ -3429,6 +3547,49 @@ } } +void BoUpSLP::cutTree() { + SmallVector<TreeEntry *, 4> VecNodes; + + for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State == TreeEntry::NeedToGather) + continue; + // For all canceled operations we should consider the possibility of + // use by with non-canceled operations and for that, it requires + // to populate ExternalUser list with canceled elements. + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + TreeEntry *UserTE = getTreeEntry(U); + if (!UserTE || ProposedToGather.count(UserTE) == 0) + continue; + // Ignore users in the user ignore list. + auto *UserInst = dyn_cast<Instruction>(U); + if (!UserInst) + continue; + + if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U + << " from lane " << Lane << " from " << *Scalar + << ".\n"); + ExternalUses.emplace_back(Scalar, U, Lane); + } + } + } + // Canceling unprofitable elements. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + ScalarToTreeEntry.erase(V); +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); +#endif + } + } +} + unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; @@ -3747,7 +3908,7 @@ SmallVector<const TreeEntry *> Entries; Optional<TargetTransformInfo::ShuffleKind> Shuffle = isGatherShuffledEntry(E, Mask, Entries); - if (Shuffle.hasValue()) { + if (Shuffle.hasValue() && ProposedToGather.count(E) == 0) { InstructionCost GatherCost = 0; if (ShuffleVectorInst::isIdentityMask(Mask)) { // Perfect match in the graph, will reuse the previously vectorized @@ -4356,12 +4517,215 @@ return true; } -InstructionCost BoUpSLP::getSpillCost() const { +InstructionCost +BoUpSLP::getExtractOperationCost(const ExternalUser &EU, + SmallVectorImpl<SmallVector<int>> &ShuffleMask, + SmallVectorImpl<Value *> &FirstUsers, + SmallVectorImpl<APInt> &DemandedElts) { + InstructionCost ExtractCost = 0; + SmallVector<unsigned> VF; + + // If found user is an insertelement, do not calculate extract cost but try + // to detect it as a final shuffled/identity match. + if (EU.User && isa<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { + Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (!InsertIdx || *InsertIdx == UndefMaskElem) + return 0; + Value *VU = EU.User; + auto *It = find_if(FirstUsers, [VU](Value *V) { + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + auto *IE1 = cast<InsertElementInst>(VU); + auto *IE2 = cast<InsertElementInst>(V); + // Go though of insertelement instructions trying to find either VU as + // the original vector for IE2 or V as the original vector for IE1. + do { + if (IE1 == VU || IE2 == V) + return true; + if (IE1) + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + if (IE2) + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } while (IE1 || IE2); + return false; + }); + int VecId = -1; + if (It == FirstUsers.end()) { + VF.push_back(FTy->getNumElements()); + ShuffleMask.emplace_back(VF.back(), UndefMaskElem); + FirstUsers.push_back(EU.User); + DemandedElts.push_back(APInt::getZero(VF.back())); + VecId = FirstUsers.size() - 1; + } else { + VecId = std::distance(FirstUsers.begin(), It); + } + int Idx = *InsertIdx; + ShuffleMask[VecId][Idx] = EU.Lane; + DemandedElts[VecId].setBit(Idx); + } + } + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; + if (MinBWs.count(ScalarRoot)) { + auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); + auto Extend = + MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; + VecTy = FixedVectorType::get(MinTy, BundleWidth); + ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), + VecTy, EU.Lane); + } else { + ExtractCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + } + return ExtractCost; +} + +InstructionCost BoUpSLP::getExtractShuffleCost(InstructionCost Cost) { + InstructionCost ExtractCost = 0; + InstructionCost ShuffleCost = 0; + SmallPtrSet<Value *, 16> ExtractCostCalculated; + SmallVector<SmallVector<int>> ShuffleMask; + SmallVector<Value *> FirstUsers; + SmallVector<APInt> DemandedElts; + for (const ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + continue; + + // No extract cost for vector "scalar" + if (isa<FixedVectorType>(EU.Scalar->getType())) + continue; + + // Already counted the cost for external uses when tried to adjust the cost + // for extractelements, no need to add it again. + if (isa<ExtractElementInst>(EU.Scalar)) + continue; + + ExtractCost += + getExtractOperationCost(EU, ShuffleMask, FirstUsers, DemandedElts); + } + + // Consider the possibility of extracting vectorized + // values for canceled elements use. + for (TreeEntry *Entry : ProposedToGather) { + for (Value *V : Entry->Scalars) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. + auto It = InternalTreeUses.find(V); + if (It != InternalTreeUses.end()) { + const UserList &UL = It->second; + for (const ExternalUser &IU : UL) + if (!ExtractCostCalculated.insert(IU.Scalar).second) + ExtractCost += getExtractOperationCost(IU, ShuffleMask, FirstUsers, + DemandedElts); + } + } + } + + for (int I = 0, E = FirstUsers.size(); I < E; ++I) { + // For the very first element - simple shuffle of the source vector. + int Limit = ShuffleMask[I].size() * 2; + if (I == 0 && + all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + InstructionCost C = TTI->getShuffleCost( + TTI::SK_PermuteSingleSrc, + cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + ShuffleCost += C; + continue; + } + // Other elements - permutation of 2 vectors (the initial one and the next + // Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int &Mask = ShuffleMask[I][Idx]; + Mask = Mask == UndefMaskElem ? Idx : VF + Mask; + } + InstructionCost C = TTI->getShuffleCost( + TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), + ShuffleMask[I]); + LLVM_DEBUG( + dbgs() + << "SLP: Adding cost " << C + << " for final shuffle of vector node and external insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + ShuffleCost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, + /*Extract*/ false); + ShuffleCost -= InsertCost; + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + } + + return ExtractCost + ShuffleCost; +} + +InstructionCost BoUpSLP::getInsertCost(ArrayRef<Value *> VectorizedVals) { + InstructionCost InsertCost = 0; + for (TreeEntry *Entry : ProposedToGather) { + // Avoid already vectorized TreeEntries, it is already in a vector form and + // we don't need to gather those operations or nodes that were once + // considered to be vectorized but now don't have any direct relations + // to vectorizable nodes. + for (Value *V : Entry->Scalars) { + auto *Inst = cast<Instruction>(V); + if (llvm::any_of(Inst->users(), [this](User *Op) { + if (const TreeEntry *UserTE = getTreeEntry(Op)) { + return (ProposedToGather.count(UserTE) != 0); + } + return false; + })) { + InsertCost += getEntryCost(Entry, VectorizedVals); + break; + } + } + } + return InsertCost; +} + +void BoUpSLP::findLeaf(TreeEntry *C, SetVector<TreeEntry *> &Path) const { + if (!Path.count(C)) + Path.insert(C); + int NonGatherUse; + do { + NonGatherUse = 0; + for (TreeEntry *Next : llvm::reverse(C->UseEntries)) { + // Ignore any processed nodes to avoid cycles. + if (Next->State == TreeEntry::NeedToGather || Path.count(Next) || + Next == C) + continue; + C = Next; + Path.insert(C); + NonGatherUse++; + break; + } + } while (NonGatherUse != 0); +} + +InstructionCost BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). - unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); InstructionCost Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; @@ -4434,6 +4798,7 @@ } if (NumCalls) { + NoCallInst = false; SmallVector<Type*, 4> V; for (auto *II : LiveValues) { auto *ScalarTy = II->getType(); @@ -4450,155 +4815,105 @@ return Cost; } -InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { - InstructionCost Cost = 0; - LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " - << VectorizableTree.size() << ".\n"); +bool BoUpSLP::findSubTree(SubTreeQueue &SubTrees, InstructionCost TreeCost) { + SetVector<TreeEntry *> Path; + std::vector<TreeEntry *> SubPath; + SetVector<TreeEntry *> Visited; + TreeEntry *Node = VectorizableTree.front().get(); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); + // Avoid reducing the tree if there is no potential room to reduce. + if ((TreeCost - ReduceableCost) >= -SLPCostThreshold) + return false; - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + // To start we can find just one leaf node that happens to be not the root + // node of the graph i.e. with non-zero index. Then, Path is route from the + // root node to our leaf node. + findLeaf(Node, Path); + if (Node == Path.back()) + return false; + do { + Node = Path.back(); + Visited.insert(Node); + assert(Node->State != TreeEntry::NeedToGather && "Incorrect node state"); + // If we found a branch node i.e. node with more than one non-gathering + // child, we could try to find set of profitable nodes in SubPath to + // vectorize and if there is no such set of profitable nodes then we could + // consider another leaf that is reachable from this branch node. + if (Node->isFork()) { + TreeEntry *NextFromFork = nullptr; + auto It = llvm::find_if( + llvm::reverse(Node->UseEntries), [&Node, &Visited](TreeEntry *E) { + return (E != Node && E->State != TreeEntry::NeedToGather && + !Visited.count(E)); + }); + if (It != Node->UseEntries.rend()) + NextFromFork = *It; + else + Path.pop_back(); + + InstructionCost SubTreeCost = 0; + for (TreeEntry *Entry : SubPath) + SubTreeCost += Entry->Cost; + if (SubTreeCost.isValid() && SubTreeCost > 0) + SubTrees.push(std::make_pair(*SubTreeCost.getValue(), SubPath)); + SubPath.clear(); + if (NextFromFork && NextFromFork != Node) { + findLeaf(NextFromFork, Path); + Node = Path.back(); + } + } else { + // If this node is not a branch node then we could move to another node + // below until we reach the root node of the graph or encounter another + // branch node. + SubPath.push_back(Node); + Path.pop_back(); + } + } while (Node->Idx); - InstructionCost C = getEntryCost(&TE, VectorizedVals); - Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - } + return (SubTrees.size() > 0); +} - SmallPtrSet<Value *, 16> ExtractCostCalculated; - InstructionCost ExtractCost = 0; - SmallVector<unsigned> VF; - SmallVector<SmallVector<int>> ShuffleMask; - SmallVector<Value *> FirstUsers; - SmallVector<APInt> DemandedElts; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; +InstructionCost BoUpSLP::getRawTreeCost(ArrayRef<Value *> VectorizedVals) { + InstructionCost CostSum = 0; + BundleWidth = VectorizableTree.front()->Scalars.size(); + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << VectorizableTree.size() << ".\n"); - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; + for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); - // No extract cost for vector "scalar" - if (isa<FixedVectorType>(EU.Scalar->getType())) - continue; + TE.Cost = getEntryCost(&TE, VectorizedVals); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost + << " for bundle that starts with " << *TE.Scalars[0] + << ".\n"); + CostSum += TE.Cost; + LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n"); - // Already counted the cost for external uses when tried to adjust the cost - // for extractelements, no need to add it again. - if (isa<ExtractElementInst>(EU.Scalar)) + if (TE.State == TreeEntry::NeedToGather) continue; - // If found user is an insertelement, do not calculate extract cost but try - // to detect it as a final shuffled/identity match. - if (EU.User && isa<InsertElementInst>(EU.User)) { - if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { - Optional<int> InsertIdx = getInsertIndex(EU.User, 0); - if (!InsertIdx || *InsertIdx == UndefMaskElem) - continue; - Value *VU = EU.User; - auto *It = find_if(FirstUsers, [VU](Value *V) { - // Checks if 2 insertelements are from the same buildvector. - if (VU->getType() != V->getType()) - return false; - auto *IE1 = cast<InsertElementInst>(VU); - auto *IE2 = cast<InsertElementInst>(V); - // Go though of insertelement instructions trying to find either VU as - // the original vector for IE2 or V as the original vector for IE1. - do { - if (IE1 == VU || IE2 == V) - return true; - if (IE1) - IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); - if (IE2) - IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); - } while (IE1 || IE2); - return false; - }); - int VecId = -1; - if (It == FirstUsers.end()) { - VF.push_back(FTy->getNumElements()); - ShuffleMask.emplace_back(VF.back(), UndefMaskElem); - FirstUsers.push_back(EU.User); - DemandedElts.push_back(APInt::getZero(VF.back())); - VecId = FirstUsers.size() - 1; - } else { - VecId = std::distance(FirstUsers.begin(), It); - } - int Idx = *InsertIdx; - ShuffleMask[VecId][Idx] = EU.Lane; - DemandedElts[VecId].setBit(Idx); - } - } - - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = FixedVectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } + if (TE.Idx && TE.Cost > 0) + ReduceableCost += TE.Cost; } + return CostSum; +} - InstructionCost SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; - for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - // For the very first element - simple shuffle of the source vector. - int Limit = ShuffleMask[I].size() * 2; - if (I == 0 && - all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && - !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, - cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - continue; - } - // Other elements - permutation of 2 vectors (the initial one and the next - // Ith incoming vector). - unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int &Mask = ShuffleMask[I][Idx]; - Mask = Mask == UndefMaskElem ? Idx : VF + Mask; - } - InstructionCost C = TTI->getShuffleCost( - TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), - ShuffleMask[I]); - LLVM_DEBUG( - dbgs() - << "SLP: Adding cost " << C - << " for final shuffle of vector node and external insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, - /*Extract*/ false); - Cost -= InsertCost; - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); +InstructionCost BoUpSLP::getTreeCost(bool TreeReduce, + ArrayRef<Value *> VectorizedVals) { + InstructionCost CostSum; + if (!IsCostSumReady) { + CostSum = getRawTreeCost(VectorizedVals); + RawTreeCost = CostSum; + } else { + CostSum = RawTreeCost; } + InstructionCost ExtractCost = getExtractShuffleCost(CostSum); + InstructionCost SpillCost = getSpillCost(); + InstructionCost InsertCost = getInsertCost(VectorizedVals); + InstructionCost Cost = CostSum + ExtractCost + SpillCost + InsertCost; + InstructionCost FullCost = Cost; + #ifndef NDEBUG SmallString<256> Str; { @@ -4612,7 +4927,77 @@ ViewGraph(this, "SLP" + F->getName(), false, Str); #endif - return Cost; + if (TreeReduce && Cost >= -SLPCostThreshold) { + std::vector<TreeEntry *> Vec; + SubTreeQueue SubTrees; + if (!findSubTree(SubTrees, Cost)) + return Cost; + unsigned NodeCounter = 0; + + for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + Entry->dump(); + } + + while (!SubTrees.empty()) { + std::pair<int, std::vector<TreeEntry *>> SubTree = SubTrees.top(); + SubTrees.pop(); + NodeCounter++; + + bool NoConnectToVecOp = true; + for (TreeEntry *T : SubTree.second) { + ProposedToGather.insert(T); + T->State = TreeEntry::NeedToGather; + // Zero cost on any gather operations with no direct + // connection to vector nodes. At this point those nodes stop + // being gather nodes + for (TreeEntry *U : T->UseEntries) { + for (const EdgeInfo &EI : U->UserTreeIndices) { + if (EI.UserTE->State != TreeEntry::NeedToGather) { + NoConnectToVecOp = false; + break; + } + } + if (!NoConnectToVecOp) + break; + } + if (NoConnectToVecOp) + for (TreeEntry *U : T->UseEntries) { + CostSum -= U->Cost; + U->Cost = 0; + } + CostSum -= T->Cost; + T->Cost = getEntryCost(T, VectorizedVals); + CostSum += T->Cost; + + for (Value *V : T->Scalars) { + MustGather.insert(V); + ExternalUses.erase( + llvm::remove_if(ExternalUses, + [V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + ExtractCost = getExtractShuffleCost(CostSum); + if (!NoCallInst) + SpillCost = getSpillCost(); + assert((!NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + InstructionCost InsertCost = getInsertCost(VectorizedVals); + Cost = CostSum + ExtractCost + SpillCost + InsertCost; + for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + Entry->dump(); + } + if (Cost < -SLPCostThreshold && !isTreeTinyAndNotFullyVectorizable() && + (VectorizableTree[0]->State != TreeEntry::NeedToGather && + VectorizableTree[1]->State != TreeEntry::NeedToGather)) { + cutTree(); + return Cost; + } + } + } + ProposedToGather.clear(); + } + return FullCost; } Optional<TargetTransformInfo::ShuffleKind> @@ -5666,12 +6051,24 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (!ProposedToGather.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if ((Entry->State != TreeEntry::NeedToGather) && + !Entry->VectorizedValue) + vectorizeTree(Entry); + } + // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. @@ -5813,7 +6210,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && ProposedToGather.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -6042,6 +6441,7 @@ BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; + BundleMember->TE = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; if (BundleMember->UnscheduledDepsInBundle == 0) { ReadyInsts.insert(BundleMember); @@ -6311,6 +6711,85 @@ ReadyInsts.clear(); } +void BoUpSLP::BlockScheduling::reduceSchedulingRegion(Instruction *Start, + Instruction *End) { + if (Start) + ScheduleStart = Start; + if (End) + ScheduleEnd = End; +} + +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + SmallPtrSet<Instruction *, 12> Gathers; + SmallPtrSet<Instruction *, 12> Reduced; + Instruction *Start = nullptr; + + // We can reduce the number of instructions to be considered for scheduling, + // after cutting the tree. Here we shrink the scheduling area from the top, + // consecutively, untill we encounter the required instruction. There might be + // unnecessary NeedToGather nodes with the relationship only to other + // NeedToGather nodes and unmap instructions in chains, we could safely + // delete those. + for (std::unique_ptr<TreeEntry> &TEPtr : reverse(VectorizableTree)) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::NeedToGather || !TE->getOpcode() || + TE->getMainOp()->getParent() != BS->BB) + continue; + for (const EdgeInfo &EI : TE->UserTreeIndices) { + if (EI.UserTE && (EI.UserTE->State != TreeEntry::NeedToGather)) { + auto InstructionsOnly = + make_filter_range(TE->Scalars, Instruction::classof); + for (Value *V : InstructionsOnly) + Gathers.insert(cast<Instruction>(V)); + break; + } + } + } + + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (!getTreeEntry(I) && !Gathers.count(I)) { + Reduced.insert(I); + } else { + Start = I; + break; + } + } + + BS->reduceSchedulingRegion(Start, nullptr); + + for (TreeEntry *Entry : ProposedToGather) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + SD->IsScheduled = false; + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + + if (Reduced.size()) { + for (Instruction *I : Reduced) { + ScheduleData *SD = BS->getScheduleData(I); + if (SD) + SD->SchedulingRegionID = -1; + } + } + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -6885,7 +7364,7 @@ R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(true); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -7091,6 +7570,7 @@ // Check that all of the parts are instructions of the same type, // we permit an alternate opcode via InstructionsState. InstructionsState S = getSameOpcode(VL); + if (!S.getOpcode()) return false; @@ -7182,7 +7662,7 @@ continue; R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(true); CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -7852,7 +8332,7 @@ // Estimate cost. InstructionCost TreeCost = - V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth)); + V.getTreeCost(false, makeArrayRef(&ReducedVals[i], ReduxWidth)); InstructionCost ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF); InstructionCost Cost = TreeCost + ReductionCost; Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry: