Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -114,6 +114,15 @@
 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
+static cl::opt
+SLPThrottling("slp-throttling", cl::init(false), cl::Hidden,
+              cl::desc("Enable tree partial vectorize with throttling"));
+
+static cl::opt
+SLPMaxCostRecalc("slp-throttling-budget", cl::init(500), cl::Hidden,
+                 cl::desc("Limit the total number of nodes for cost "
+                          "recalculations during throttling"));
+
 static cl::opt ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
     cl::desc(
@@ -524,11 +533,36 @@
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost();
+  int getSpillCost(const SmallPtrSetImpl &ScalarsToVec);
+
+  /// \returns the cost of extracting vectorized elements.
+  int getExtractCost(const SmallPtrSetImpl &ScalarsToVec);
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost(const SmallPtrSetImpl &VecToScalars,
+                    unsigned Height);
+
+  /// Determine the cost of partial vectorization.
+  int getSubTreeCost(ArrayRef VecNodes, ArrayRef EntriesCosts);
+
+  /// Does a BFS for non-gathering nodes of SLP graph.
+  void BFS();
+
+  /// Determine structure of the graph.
+  void treeTraversal();
+
+  /// Find the subgraph of the graph that is profitable to vectorize.
+  int findSubTree(ArrayRef EntriesCosts);
+
+  /// Try to extend the subgraph that makes sense to vectorize from
+  /// a cost perspective.
+  int addLeafs(unsigned LeafIdx, ArrayRef VecNodes,
+               ArrayRef EntriesCosts);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost();
+  int getTreeCost(bool cutTree = false);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -549,6 +583,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -556,6 +592,11 @@
       BS->clear();
     }
     MinBWs.clear();
+    UseTreeIndices.clear();
+    Nodes.clear();
+    NodesList.clear();
+    LeafsList.clear();
+    SLPCostRecalc = 0;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -599,6 +640,9 @@
     return MinVecRegSize;
   }
 
+  /// Save seed instructions to try to partially vectorize later.
+  void recordSeeds(ArrayRef Ops);
+
   /// Check if ArrayType or StructType is isomorphic to some VectorType.
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
@@ -608,6 +652,12 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree(ArrayRef VecNodes);
+
+  /// Try to partially vectorize the tree via throttling.
+  bool tryPartialVectorization();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1308,6 +1358,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap ScalarToTreeEntry;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
@@ -1327,6 +1380,44 @@
   };
   using UserList = SmallVector;
 
+  struct Leaf {
+
+    /// Index of leaf in the tree.
+    unsigned Idx = 0;
+
+    Leaf(unsigned I) : Idx(I) {}
+
+    /// Full path to this leaf with tree nodes.
+    SmallVector Path;
+  };
+
+  struct Node {
+
+    /// Index to this tree node.
+    unsigned Idx = 0;
+
+    /// A previous node along the way to this node.
+    unsigned Prev = 0;
+
+    /// Color of this node.
+    unsigned Color = 0;
+
+    Node(unsigned I) : Idx(I) {}
+
+    /// Is this node a branch in the tree.
+    bool IsBranch = false;
+
+    /// If this is a branch, list of all leafs reachable
+    /// from here.
+    SmallVector LeafsForBranch;
+
+    /// Non null if this node is a leaf.
+    BoUpSLP::Leaf *Leaf = nullptr;
+  };
+
+  /// \returns the cost of extracting a vectorized element.
+  int getExtractOperationCost(ExternalUser &EU);
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1378,6 +1469,26 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// Array of leafs in non-gathering nodes graph.
+  SmallVector, 2> LeafsList;
+
+  /// Array of non-gathering nodes of graph.
+  SmallVector, 8> NodesList;
+
+  /// Maps non-gathering nodes.
+  DenseMap Nodes;
+
+  /// List of all seed instructions that were not vectorized, that
+  /// we will try to vectorize with partial vectorization.
+  SmallVector>, 8> Seeds;
+
+  /// Number of tree nodes for which we have already recalculated the
+  /// cost of the subgraph during throttling.
+  int SLPCostRecalc = 0;
+
+  /// Internal tree proposed to vectorized values use in that tree.
+  SmallDenseMap InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet EphValues;
 
@@ -1387,6 +1498,9 @@
   /// A list of blocks that we are going to CSE.
   SetVector CSEBlocks;
 
+  /// Relations of non-gathering graph nodes.
+  SmallDenseMap> UseTreeIndices;
+
   /// Contains all scheduling relevant data for an instruction.
   /// A ScheduleData either represents a single instruction or a member of an
   /// instruction bundle (= a group of instructions which is combined into a
@@ -1736,6 +1850,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector> BlocksSchedules;
 
+  /// Remove operations from the list of proposed to schedule.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -1941,6 +2058,7 @@
         LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n");
         assert(!UseEntry->NeedToGather && "Bad state");
+        InternalTreeUses[U].push_back(ExternalUser(Scalar, U, FoundLane));
         continue;
       }
     }
@@ -2559,6 +2677,90 @@
   }
 }
 
+void BoUpSLP::cutTree(ArrayRef VecNodes) {
+  SmallPtrSet Removed;
+  // Canceling unprofitable elements.
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (!is_contained(VecNodes, I)) {
+      Entry->NeedToGather = true;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+        ScalarToTreeEntry.erase(V);
+        Removed.insert(V);
+        RemovedOperations.push_back(I);
+        MustGather.insert(V);
+        ExternalUses.erase(
+            std::remove_if(ExternalUses.begin(), ExternalUses.end(),
+                           [&](ExternalUser &EU) { return EU.Scalar == V; }),
+            ExternalUses.end());
+      }
+    }
+  }
+  // For all canceled operations we should consider the possibility of
+  // their use by non-canceled operations; for that, we need to
+  // populate the ExternalUses list with the canceled elements.
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (is_contained(VecNodes, I)) {
+      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+        Value *Scalar = Entry->Scalars[Lane];
+        for (User *U : Scalar->users()) {
+          LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+          Instruction *UserInst = dyn_cast(U);
+          if (!UserInst)
+            continue;
+          if (!Removed.count(U))
+            continue;
+          // Ignore users in the user ignore list.
+          if (is_contained(UserIgnoreList, UserInst))
+            continue;
+          LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation :"
+                            << *U << " from lane " << Lane
+                            << " from " << *Scalar << ".\n");
+          ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+        }
+      }
+    }
+  }
+}
+
+bool BoUpSLP::tryPartialVectorization() {
+  bool Changed = false;
+  for (unsigned I = 0; I < Seeds.size(); ++I) {
+    auto &S = *Seeds[I].get();
+    // Check that seed instructions are still alive.
+    if (std::any_of(S.begin(), S.end(), [](Value *V) {
+          return (!(cast(V))->getOperand(0));
+        }))
+      continue;
+
+    buildTree(S);
+
+    // If other parts of the BB were vectorized, the tree might not be
+    // interesting enough to look at.
+    if (isTreeTinyAndNotFullyVectorizable())
+      continue;
+
+    int Cost = getTreeCost(true);
+    if (Cost < -SLPCostThreshold) {
+      vectorizeTree();
+      Changed = true;
+    }
+  }
+  Seeds.clear();
+  return Changed;
+}
+
+void BoUpSLP::recordSeeds(ArrayRef Ops) {
+  Seeds.push_back(llvm::make_unique>(Ops.begin(), Ops.end()));
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N;
   Type *EltTy;
@@ -3066,7 +3268,7 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() {
+int BoUpSLP::getSpillCost(const SmallPtrSetImpl &ScalarsToVec) {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
@@ -3090,7 +3292,7 @@
     // Update LiveValues.
LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && ScalarsToVec.count(&*J)) LiveValues.insert(cast(&*J)); } @@ -3131,15 +3333,315 @@ return Cost; } -int BoUpSLP::getTreeCost() { +int BoUpSLP::getExtractOperationCost(ExternalUser &EU) { + unsigned BundleWidth = VectorizableTree.front().Scalars.size(); + + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + + if (MinBWs.count(ScalarRoot)) { + auto *MinTy = IntegerType::get(F->getContext(), + MinBWs[ScalarRoot].first); + auto Extend = + MinBWs[ScalarRoot].second ? + Instruction::SExt : Instruction::ZExt; + VecTy = VectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), + VecTy, EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +int BoUpSLP::getExtractCost(const SmallPtrSetImpl &ScalarsToVec) { + int ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + for (ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // Avoid non-vectorized scalars. + if (!ScalarsToVec.count(EU.Scalar)) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. 
+ if (InternalTreeUses.find(EU.Scalar) != InternalTreeUses.end()) + for (ExternalUser &IU : InternalTreeUses[EU.Scalar]) + ExtractCost += getExtractOperationCost(IU); + continue; + } + + ExtractCost += getExtractOperationCost(EU); + } + return ExtractCost; +} + +int BoUpSLP::getInsertCost(const SmallPtrSetImpl &VecToScalars, + unsigned Height) { + int InsertCost = 0; + for (int I = Height; I >= 0; I--) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + for (Use &U : Inst->operands()) { + Value *Op = U.get(); + if (VecToScalars.count(Op)) + InsertCost += getGatherCost(Op); + } + } + } + return InsertCost; +} + +int BoUpSLP::getSubTreeCost(ArrayRef VecNodes, + ArrayRef EntriesCosts) { + int Cost = 0; + int ExtractCost = 0; + int SpillCost = 0; + SmallPtrSet ScalarsToVec; + SmallPtrSet VecToScalars; + +#ifndef NDEBUG + for (unsigned N : VecNodes) { + if (N == 0) + continue; + unsigned C = N; + do { + C = Nodes[C]->Prev; + assert(is_contained(VecNodes, C) && + "Incorrect gap in the tree structure"); + } while (C != 0); + } +#endif + + SLPCostRecalc += VectorizableTree.size(); + + for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { + TreeEntry *Entry = &VectorizableTree[I]; + if (Entry->NeedToGather) + continue; + if (is_contained(VecNodes, I)) { + for (Value *V : Entry->Scalars) + ScalarsToVec.insert(V); + int GatherCost = 0; + auto It = UseTreeIndices.find(I); + if (It != UseTreeIndices.end()) + for (int Gather : It->second) + if (VectorizableTree[Gather].NeedToGather) + GatherCost += EntriesCosts[Gather]; + Cost += EntriesCosts[I] + GatherCost; + } else { + for (Value *V : Entry->Scalars) + VecToScalars.insert(V); + } + } + + int Inserts = getInsertCost(VecToScalars, VectorizableTree.size() - 1); + Cost += Inserts; + ExtractCost = getExtractCost(ScalarsToVec); + Cost += ExtractCost; + SpillCost = getSpillCost(ScalarsToVec); + Cost += SpillCost; + + return Cost; 
+}
+
+void BoUpSLP::BFS() {
+  std::vector Worklist;
+  Worklist.push_back(0);
+  do {
+    unsigned Current = Worklist.back();
+    Worklist.pop_back();
+    bool isLeaf = true;
+    int BranchCounter = 0;
+    Nodes[0]->Color = 1;
+    for (unsigned Next : UseTreeIndices[Current]) {
+      if (Next == Current)
+        continue;
+      if (VectorizableTree[Next].NeedToGather)
+        continue;
+      BranchCounter++;
+      isLeaf = false;
+      if (BranchCounter > 1) {
+        Nodes[Current]->IsBranch = true;
+      }
+      // FIXME: To simplify the algorithm, consider
+      // the top node of a cycle as a leaf.
+      if (Nodes[Next]->Color != 0) {
+        isLeaf = true;
+      } else {
+        Worklist.push_back(Next);
+        Nodes[Next]->Color = 1;
+        Nodes[Next]->Prev = Current;
+      }
+    }
+    if (isLeaf && Current) {
+      LeafsList.push_back(llvm::make_unique(Current));
+      Nodes[Current]->Leaf = LeafsList.back().get();
+    }
+  } while (!Worklist.empty());
+}
+
+void BoUpSLP::treeTraversal() {
+  assert(NodesList.size() == 0 && "Bad state");
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    if (VectorizableTree[I].NeedToGather)
+      continue;
+    NodesList.push_back(llvm::make_unique(I));
+    Nodes[I] = NodesList.back().get();
+    assert(I == Nodes[I]->Idx && "Incorrect node index");
+  }
+
+  BFS();
+
+  // Construct a path to every leaf.
+ for (auto &L : LeafsList) { + auto *Leaf = L.get(); + assert(Nodes[Leaf->Idx]->Leaf && "This suppose to be a leaf node"); + unsigned Current = Leaf->Idx; + do { + Leaf->Path.push_back(Current); + if (Nodes[Current]->IsBranch) + Nodes[Current]->LeafsForBranch.push_back(Leaf->Idx); + Current = Nodes[Current]->Prev; + } while (Current != 0); + Leaf->Path.push_back(0); + std::reverse(Leaf->Path.begin(), Leaf->Path.end()); + } +} + +int BoUpSLP::addLeafs(unsigned LeafIdx, ArrayRef VecNodes, + ArrayRef EntriesCosts) { int Cost = 0; - LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " - << VectorizableTree.size() << ".\n"); + SmallVector Vecs; + SmallVector Path; + for (unsigned V : VecNodes) { + Vecs.push_back(V); + Path.push_back(V); + } + Leaf *LeafNode = Nodes[LeafIdx]->Leaf; + assert(LeafNode && "This suppose to be a leaf node"); - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + // Add leafs from the graph along the way to the bottom of + // the graph if profitable. + do { + unsigned Idx = Path.back(); + if (Nodes[Idx]->IsBranch) { + for (unsigned L : Nodes[Idx]->LeafsForBranch) { + if (Nodes[L]->Color == LeafIdx) + continue; + Leaf *Leaf = Nodes[L]->Leaf; + assert(Leaf && "This suppose to be a leaf node"); + + SmallVector SubPath(Leaf->Path); + SmallVector PathNodes; + + do { + unsigned Item = SubPath.back(); + if (Nodes[Item]->Color == LeafIdx) { + for (unsigned I = PathNodes.size(); I--;) { + SmallVector NewVecs(Vecs); + // Add node by node to the tree, the whole + // remaining path to leaf might not be profitable. 
+ for (unsigned J = I; JColor = LeafIdx; + } + break; + } + } + break; + } + PathNodes.push_back(Item); + SubPath.pop_back(); + } while (SubPath.size() > 0); + } + } + Path.pop_back(); + } while (Path.size() > 0); + + if (Cost < -SLPCostThreshold) { + assert(Vecs.size() < NodesList.size() && + "Incorrect number to vectorize nodes"); + LLVM_DEBUG(dbgs() + << "SLP: Reduced the number of elements to " << Vecs.size() + << " in the tree with " << Cost + << " cost to make it partially vectorizable.\n"); + cutTree(Vecs); + return Cost; + } + return INT_MAX; +} +int BoUpSLP::findSubTree(ArrayRef EntriesCosts) { + SmallVector Leafs; + SmallDenseMap CostToVec; + + treeTraversal(); + for (auto &L : LeafsList) + Leafs.push_back(L.get()); + + std::sort(Leafs.begin(), Leafs.end(), + [](Leaf *A, Leaf *B) { return A->Path.size() > B->Path.size(); }); + + // Walk by graph leafs and reduce branch height + // to find any profitable node. + for (auto *Leaf : Leafs) { + SmallVector Path(Leaf->Path); + do { + unsigned Idx = Path.back(); + if (CostToVec[Idx] || SLPMaxCostRecalc < SLPCostRecalc) + break; + CostToVec[Idx] = getSubTreeCost(Path, EntriesCosts); + if (CostToVec[Idx] < -SLPCostThreshold) { + for (unsigned Idx : Path) + Nodes[Idx]->Color = Leaf->Idx; + int CostWithLeafs = addLeafs(Leaf->Idx, Path, EntriesCosts); + if (CostWithLeafs == INT_MAX) { + LLVM_DEBUG(dbgs() + << "SLP: Reduced the number of elements to " << Path.size() + << " in the tree with " << CostToVec[Idx] + << " cost to make it partially vectorizable.\n"); + cutTree(Path); + return CostToVec[Idx]; + } + return CostWithLeafs; + } + Path.pop_back(); + } while (Path.size() > 0); + } + return INT_MAX; +} + +int BoUpSLP::getTreeCost(bool cutTree) { + SmallPtrSet ScalarsToVec; + SmallPtrSet VecToScalars; + SmallVector EntriesCosts; + int CostSum = 0; + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << VectorizableTree.size() << ".\n"); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { 
TreeEntry &TE = VectorizableTree[I]; + EntriesCosts.push_back(0); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -3160,56 +3662,47 @@ })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + for (auto &UseTreeIdx : TE.UserTreeIndices) + UseTreeIndices[UseTreeIdx.Idx].push_back(I); + + if (!TE.NeedToGather) { + for (Value *V : TE.Scalars) + ScalarsToVec.insert(V); + } + + EntriesCosts[I] = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << EntriesCosts[I] << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; + + CostSum += EntriesCosts[I]; } - SmallPtrSet ExtractCostCalculated; int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; + int SpillCost = 0; + int Cost = CostSum; - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; - - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } - } - - int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + ExtractCost = getExtractCost(ScalarsToVec); + Cost += ExtractCost; + SpillCost = getSpillCost(ScalarsToVec); + Cost += SpillCost; std::string Str; - { - raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; + if (SLPThrottling && cutTree && Cost >= -SLPCostThreshold) { + Cost = findSubTree(EntriesCosts); + { + raw_string_ostream OS(Str); + OS << "SLP: Partial vectorization with Total Cost = " << Cost << ".\n"; + } + } else { + { + raw_string_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; + } } LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); @@ -3928,7 +4421,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (RemovedOperations.size()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -4057,13 +4555,16 @@ Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG - for (User *U : Scalar->users()) { - LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - - // It is legal to replace users in the ignorelist by undef. 
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); - } + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!RemovedOperations.size()) + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + + // It is legal to replace users in the ignorelist by undef. + assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + "Replacing out-of-tree value with undef"); + } #endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); @@ -4540,6 +5041,32 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (int I : RemovedOperations) { + TreeEntry *Entry = &VectorizableTree[I]; + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5008,6 +5535,9 @@ << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } + + if (SLPThrottling) + Changed |= R.tryPartialVectorization(); } if (Changed) { @@ -5083,6 +5613,8 @@ // Move to the next bundle. i += VF - 1; Changed = true; + } else { + R.recordSeeds(Operands); } } @@ -5312,6 +5844,8 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else { + R.recordSeeds(Ops); } } } @@ -6122,7 +6656,7 @@ V.computeMinimumValueSizes(); // Estimate cost. 
- int TreeCost = V.getTreeCost(); + int TreeCost = V.getTreeCost(SLPThrottling); int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -1,22 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s +; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-throttling < %s | FileCheck %s define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 { ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double 
[[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry: