Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator.h"
@@ -118,8 +119,17 @@
                    "number "));

 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
-                   cl::desc("Attempt to vectorize horizontal reductions"));
+    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+                       cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable partial vectorization of the tree with throttling"));
+
+static cl::opt<unsigned>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(32), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));

 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -571,7 +581,48 @@
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost() const;
+  int getSpillCost();
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost();
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize
+  /// starting from seed instructions at the bottom, follows the chains of
+  /// dependencies up to the top of the SLP graph, and groups potentially
+  /// vectorizable operations in scalar form into bundles.
+  /// For example:
+  ///
+  ///           scalar form
+  ///                |
+  ///  scalar form     scalar form
+  ///            \       /
+  ///           scalar form
+  ///
+  /// The total cost is not profitable to vectorize, hence all operations stay
+  /// in scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///           vector form
+  ///                |
+  ///  vector form     scalar form
+  ///            \       /
+  ///           vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, while vectorizing the whole
+  /// tree is not.
+  /// More details: http://www.llvm.org/devmtg/2015-10/slides/Porpodas-ThrottlingAutomaticVectorization.pdf
+  bool findSubTree(int UserCost = 0);
+
+  /// Get the raw cost of all elements of the tree.
+  int getRawTreeCost();

   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
@@ -596,6 +647,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -603,6 +656,12 @@
       BS->clear();
     }
     MinBWs.clear();
+    ScalarsToVec.clear();
+    VecToScalars.clear();
+    VecInserts.clear();
+    NoCallInst = true;
+    RawTreeCost = 0;
+    IsCostSumReady = false;
   }

   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -667,6 +726,9 @@
   /// may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;

+  /// Try to cut the tree to make it partially vectorizable.
+  bool cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

   /// This structure holds any data we need about the edges being traversed
@@ -1448,7 +1510,7 @@
     Value *VectorizedValue = nullptr;

     /// Do we need to gather this sequence ?
-    enum EntryState { Vectorize, NeedToGather };
+    enum EntryState { Vectorize, NeedToGather, ProposedToGather };
     EntryState State;

     /// Does this sequence require some shuffling?
@@ -1457,6 +1519,9 @@
     /// Does this entry require reordering?
     ArrayRef ReorderIndices;

+    /// Cost of this tree entry.
+    int Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -1469,6 +1534,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;

+    /// The tree entries used by this entry (its operand entries).
+    TinyPtrVector UseEntries;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;
@@ -1599,6 +1667,9 @@
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
+      case ProposedToGather:
+        dbgs() << "ProposedToGather\n";
+        break;
       }
       dbgs() << "MainOp: ";
       if (MainOp)
@@ -1669,8 +1740,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }

-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back(Last);
+    }

     return Last;
   }
@@ -1706,9 +1779,32 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap ScalarToTreeEntry;

+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
+  /// Tree values proposed to be vectorized.
+  ValueSet ScalarsToVec;
+
+  /// Tree values once considered to be vectorized, but later decided by
+  /// throttling to stay in scalar form.
+  ValueSet VecToScalars;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

+  /// Total cost of inserts in the tree for a particular value.
+  SmallDenseMap VecInserts;
+
+  /// Raw cost of all elements in the tree.
+  int RawTreeCost = 0;
+
+  /// Indicates that no CallInst was found in the tree, so we don't need to
+  /// calculate the spill cost.
+  bool NoCallInst = true;
+
+  /// True, if we have already calculated the cost of the tree.
+  bool IsCostSumReady = false;
+
   /// This POD struct describes one external user in the vectorized tree.
   struct ExternalUser {
     ExternalUser(Value *S, llvm::User *U, int L)
@@ -1725,6 +1821,9 @@
   };
   using UserList = SmallVector;

+  /// \returns the cost of extracting a vectorized element for the given
+  /// external use.
+  int getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1775,6 +1874,12 @@
   /// after vectorization.
   UserList ExternalUses;

+  /// Current width of the bundles being vectorized.
+  unsigned BundleWidth = 0;
+
+  /// Internal uses of tree values that are proposed to be vectorized.
+  SmallDenseMap InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet EphValues;

@@ -2175,6 +2280,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector> BlocksSchedules;

+  /// Remove operations that are no longer going to be vectorized from the
+  /// scheduling.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2378,7 +2486,7 @@
   buildTree_rec(Roots, 0, EdgeInfo());

   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();

     // No need to handle users of gathered values.
@@ -2420,6 +2528,7 @@
           LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                             << ".\n");
           assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+          InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           continue;
         }
       }
@@ -3127,6 +3236,68 @@
   }
 }

+bool BoUpSLP::cutTree() {
+  SmallVector VecNodes;
+
+  // Estimate the subtree not only from a cost perspective, but also
+  // functionally: it must contain at least one real operation.
+  bool FoundRealOp = false;
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::Vectorize)
+      continue;
+    Instruction *Inst = Entry->getMainOp();
+    if (Inst && (isa(Inst) || isa(Inst) ||
+                 isa(Inst))) {
+      FoundRealOp = true;
+      break;
+    }
+  }
+  if (!FoundRealOp)
+    return false;
+
+  for (std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize)
+      VecNodes.push_back(Entry);
+  }
+  if (VecNodes.size() <= 2)
+    return false;
+  // Canceling unprofitable elements.
+  for (std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::NeedToGather)
+      continue;
+    if (Entry->State == TreeEntry::ProposedToGather) {
+      Entry->State = TreeEntry::NeedToGather;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+      }
+    }
+  }
+  // Values produced by the remaining vectorized operations may still be used
+  // by the canceled ones; to handle that, we have to populate the
+  // ExternalUses list with those uses.
+  for (TreeEntry *Entry : VecNodes)
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        if (!VecToScalars.count(U))
+          continue;
+        // Ignore users in the user ignore list.
+        auto *UserInst = cast(U);
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3283,7 +3454,7 @@
     ReuseShuffleCost =
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
   }
-  if (E->State == TreeEntry::NeedToGather) {
+  if (E->State != TreeEntry::Vectorize) {
     if (allConstant(VL))
       return 0;
     if (isSplat(VL)) {
@@ -3712,18 +3883,17 @@
   return true;
 }

-int BoUpSLP::getSpillCost() const {
+int BoUpSLP::getSpillCost() {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
   // (for example, if spills and fills are required).
-  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
   int Cost = 0;

   SmallPtrSet LiveValues;
   Instruction *PrevInst = nullptr;

-  for (const auto &TEPtr : VectorizableTree) {
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
     Instruction *Inst = dyn_cast(TEPtr->Scalars[0]);
     if (!Inst)
       continue;
@@ -3736,7 +3906,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa(&*J) && getTreeEntry(&*J))
+      if (isa(&*J) && ScalarsToVec.count(&*J))
         LiveValues.insert(cast(&*J));
     }

@@ -3764,11 +3934,11 @@
            !isa(&*PrevInstIt)) &&
           &*PrevInstIt != PrevInst)
         NumCalls++;
-
       ++PrevInstIt;
     }

     if (NumCalls) {
+      NoCallInst = false;
       SmallVector V;
       for (auto *II : LiveValues)
         V.push_back(VectorType::get(II->getType(), BundleWidth));
@@ -3781,15 +3951,132 @@
   return Cost;
 }

-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
+int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const {
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
+
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+  Value *ScalarRoot = VectorizableTree.front()->Scalars[0];
+
+  auto It = MinBWs.find(ScalarRoot);
+  if (It != MinBWs.end()) {
+    uint64_t Width = It->second.first;
+    bool Signed = It->second.second;
+    auto *MinTy = IntegerType::get(F->getContext(), Width);
+    unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt;
+    VecTy = VectorType::get(MinTy, BundleWidth);
+    return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy,
+                                          EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
+
+int BoUpSLP::getExtractCost() const {
+  int ExtractCost = 0;
+  SmallPtrSet ExtractCostCalculated;
+  // Consider the possibility of extracting vectorized
+  // values for use by canceled elements.
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State != TreeEntry::ProposedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      // Consider the possibility of extracting vectorized
+      // values for use by canceled elements.
+      auto It = InternalTreeUses.find(V);
+      if (It != InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+    }
+  }
+  for (const ExternalUser &EU : ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    int Cost = getExtractOperationCost(EU);
+    ExtractCost += Cost;
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost() {
+  int InsertCost = 0;
+  for (const std::unique_ptr &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    // Avoid already vectorized TreeEntries; they are already in vector form
+    // and we don't need to gather those operations.
+ if (Entry->State != TreeEntry::ProposedToGather) + continue; + bool NeedGather = false; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + for (User *Op : Inst->users()) + if (ScalarsToVec.count(Op)) { + NeedGather = true; + break; + } + } + if (NeedGather) + InsertCost += getEntryCost(Entry); + } + return InsertCost; +} + +bool BoUpSLP::findSubTree(int UserCost) { + SmallVector Vec; + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize || Entry->Cost <= 0 || !Entry->Idx) + continue; + Vec.push_back(Entry); + if (Vec.size() > MaxCostsRecalculations) + break; + } + llvm::sort(Vec, [&](const TreeEntry *LHS, const TreeEntry *RHS) { + return LHS->Cost > RHS->Cost; + }); + + for (TreeEntry *T : Vec) { + T->State = TreeEntry::ProposedToGather; + for (Value *V : T->Scalars) { + ScalarsToVec.erase(V); + VecToScalars.insert(V); + ScalarToTreeEntry.erase(V); + MustGather.insert(V); + ExternalUses.erase( + llvm::remove_if(ExternalUses, + [&V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + int PartialCost = getTreeCost() - UserCost; + RemovedOperations.push_back(T); + if (PartialCost < -SLPCostThreshold && cutTree()) { + LLVM_DEBUG( + dbgs() << "SLP: Decided to partially vectorize tree with cost: " + << PartialCost << ".\n"); + return true; + } + } + return false; +} + +int BoUpSLP::getRawTreeCost() { + int CostSum = 0; + BundleWidth = VectorizableTree.front()->Scalars.size(); LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); - - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -3803,68 +4090,80 @@ // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.State == TreeEntry::NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::ProposedToGather) + VecToScalars.insert(TE.Scalars.begin(), TE.Scalars.end()); + if (TE.State != TreeEntry::Vectorize && + llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->State != TreeEntry::Vectorize && + EntryPtr->isSame(TE.Scalars); + })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + if (TE.State == TreeEntry::Vectorize) + ScalarsToVec.insert(TE.Scalars.begin(), TE.Scalars.end()); + + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; + CostSum += TE.Cost; } - SmallPtrSet ExtractCostCalculated; - int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. 
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  if (SLPThrottling)
+    for (std::unique_ptr &TEPtr : VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State != TreeEntry::Vectorize)
+        continue;
+      int GatherCost = 0;
+      for (TreeEntry *Gather : TE->UseEntries)
+        if (Gather->State != TreeEntry::Vectorize)
+          GatherCost += Gather->Cost;
+      TE->Cost += GatherCost;
+    }
+  return CostSum;
+}

-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+int BoUpSLP::getTreeCost() {
+  int CostSum;
+  if (!IsCostSumReady) {
+    CostSum = getRawTreeCost();
+    RawTreeCost = CostSum;
+  } else {
+    CostSum = RawTreeCost;
+  }

-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+  if (SLPThrottling)
+    for (std::unique_ptr &TEPtr : VectorizableTree) {
+      TreeEntry *TE = TEPtr.get();
+      if (TE->State == TreeEntry::ProposedToGather)
+        CostSum -= TE->Cost;
     }
-  }

-  int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  int ExtractCost = getExtractCost();
+  int SpillCost = 0;
+  if (!NoCallInst || !IsCostSumReady)
+    SpillCost = getSpillCost();
+#ifndef NDEBUG
+  if (NoCallInst)
+    assert(getSpillCost() == 0 && "Incorrect spill cost");
+#endif
+  if (!IsCostSumReady)
+    IsCostSumReady = true;
+  int InsertCost = getInsertCost();
+  int Cost = CostSum + ExtractCost + SpillCost + InsertCost;

-  std::string Str;
-  {
-    raw_string_ostream OS(Str);
-    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
-       << "SLP: Extract Cost = " << ExtractCost << ".\n"
-       << "SLP: Total Cost = " << Cost << ".\n";
-  }
+#ifndef NDEBUG
+  SmallString<256> Str;
+  raw_svector_ostream OS(Str);
+  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+     << "SLP: Extract Cost = " << ExtractCost << ".\n"
+     << "SLP: Insert Cost = " << InsertCost << ".\n"
+     << "SLP: Total Cost = " << Cost << ".\n";
   LLVM_DEBUG(dbgs() << Str);
-
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
-
+#endif
   return Cost;
 }

@@ -4616,11 +4915,22 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove all ScheduleData from the nodes for which we have changed the
+    // vectorization decision.
+ if (!RemovedOperations.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + auto *VectorRoot = vectorizeTree(VectorizableTree.front().get()); + + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State == TreeEntry::Vectorize && !Entry->VectorizedValue) + vectorizeTree(Entry); + } // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We @@ -4629,7 +4939,7 @@ if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - auto BundleWidth = VectorizableTree[0]->Scalars.size(); + BundleWidth = VectorizableTree.front()->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); @@ -4729,7 +5039,7 @@ } // For each vectorized value: - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -4744,7 +5054,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && RemovedOperations.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5229,6 +5541,31 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (TreeEntry *Entry : RemovedOperations) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5754,6 +6091,9 @@ R.vectorizeTree(); return true; + } else { + if (SLPThrottling && R.findSubTree()) + R.vectorizeTree(); } return false; @@ -6006,6 +6346,9 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else { + if (SLPThrottling && R.findSubTree(UserCost)) + R.vectorizeTree(); } } } @@ -6791,15 +7134,16 @@ int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { - V.getORE()->emit([&]() { - return OptimizationRemarkMissed( - SV_NAME, "HorSLPNotBeneficial", cast(VL[0])) - << "Vectorizing horizontal reduction is possible" - << "but not beneficial with cost " - << ore::NV("Cost", Cost) << " and threshold " - << ore::NV("Threshold", -SLPCostThreshold); - }); + if (!SLPThrottling || !V.findSubTree(-ReductionCost)) break; + V.getORE()->emit([&]() { + return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial", + cast(VL[0])) + << "Vectorizing horizontal reduction is possible" + << "but not beneficial with cost " << ore::NV("Cost", Cost) + << " and threshold " + << ore::NV("Threshold", -SLPCostThreshold); + }); 
} LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,34 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8 +; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16 ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; 
AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[DIV]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry:
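The findSubTree documentation in the first hunk describes the throttling idea only in the abstract, so here is a minimal, self-contained sketch of the same greedy strategy on a toy cost model (illustration only, not LLVM code; the ToyNode type, the costs, and the threshold are made up for the example): collect the nodes whose individual cost is positive, propose the most expensive ones for gathering one at a time, and stop as soon as the remaining partial tree clears the cost threshold.

#include <algorithm>
#include <iostream>
#include <vector>

struct ToyNode {
  int Cost;              // Positive cost: unprofitable to vectorize on its own.
  bool Proposed = false; // Proposed to stay scalar (gathered).
};

// Cost of the tree if every node that is not proposed-to-gather is vectorized.
static int treeCost(const std::vector<ToyNode> &Tree) {
  int Sum = 0;
  for (const ToyNode &N : Tree)
    if (!N.Proposed)
      Sum += N.Cost;
  return Sum;
}

int main() {
  const int CostThreshold = 0; // Vectorize only when the total cost is negative.
  std::vector<ToyNode> Tree = {{-4}, {-3}, {6}, {2}}; // Whole-tree cost is +1.
  std::cout << "whole-tree cost: " << treeCost(Tree) << "\n";

  // Candidates are the nodes with a positive cost, most expensive first.
  std::vector<ToyNode *> Candidates;
  for (ToyNode &N : Tree)
    if (N.Cost > 0)
      Candidates.push_back(&N);
  std::sort(Candidates.begin(), Candidates.end(),
            [](const ToyNode *L, const ToyNode *R) { return L->Cost > R->Cost; });

  // Propose candidates for gathering one by one and stop as soon as the
  // remaining partial tree becomes profitable.
  for (ToyNode *N : Candidates) {
    N->Proposed = true;
    int PartialCost = treeCost(Tree);
    std::cout << "partial cost after throttling one node: " << PartialCost << "\n";
    if (PartialCost < CostThreshold) {
      std::cout << "partially vectorizing the tree is profitable\n";
      break;
    }
  }
  return 0;
}

In the real patch the recalculation is bounded by the slp-throttling-budget option and the canceled entries are turned into gathers (ProposedToGather, then cutTree), but the profitability loop follows the same shape as this sketch.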