Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -517,11 +517,14 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost();
+  int getSpillCost(const SmallPtrSetImpl<Value *> &ScalarsToVec);
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost(const SmallPtrSetImpl<Value *> &ScalarsToVec);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost();
+  int getTreeCost(bool ReduceTree = false);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -542,6 +545,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -601,6 +606,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
 
+  /// Reduce the cost of the tree to make it partially vectorizable, if possible.
+  bool reduceTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
 private:
@@ -701,6 +709,13 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Cost of this tree entry.
+    int Cost = 0;
+
+    /// Full cost of the tree if vectorization is stopped (scalarized) right
+    /// after this node; used when throttling reduces the tree.
+    int ScalarizeAtCost = 0;
   };
 
   /// Create a new VectorizableTree entry.
@@ -743,6 +758,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector<int, 4> RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
@@ -762,6 +780,9 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  /// \returns the cost of extracting one vectorized element.
+  int getExtractOperationCost(ExternalUser &EU);
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -813,6 +834,11 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// A list of values that may need to be extracted out of the tree, in case
+  /// the tree was reduced and canceled operations still use values that
+  /// remain proposed for vectorization.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -1171,6 +1197,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 
+  /// Removes operations from the list of operations proposed to be scheduled.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -1376,6 +1405,7 @@
           LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                             << ".\n");
           assert(!UseEntry->NeedToGather && "Bad state");
+          InternalTreeUses[U].push_back(ExternalUser(Scalar, U, FoundLane));
           continue;
         }
       }
@@ -2419,6 +2449,93 @@
   }
 }
 
+bool BoUpSLP::reduceTree() {
+  // Estimate where to stop vectorizing the tree.
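+  // Each entry's ScalarizeAtCost is the cost of the whole tree when all
+  // entries after it are scalarized, so scanning bottom-up finds the
+  // largest prefix of the tree that is still profitable to vectorize.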
+  unsigned StopAt = VectorizableTree.size() - 1;
+  int MinCost = VectorizableTree.back().ScalarizeAtCost;
+
+  int NonGatherNodes = 0;
+  SmallVector<int, 8> NonGather;
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (!Entry->NeedToGather)
+      NonGatherNodes++;
+    NonGather.push_back(NonGatherNodes);
+  }
+
+  for (unsigned I = VectorizableTree.size(); I--;) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    // Stop at the first profitable entry, but avoid a tree of just one
+    // seed operation, which is unprofitable.
+    if (VectorizableTree[I].ScalarizeAtCost < -SLPCostThreshold &&
+        NonGather[I] > 1) {
+      MinCost = VectorizableTree[I].ScalarizeAtCost;
+      StopAt = I;
+      break;
+    }
+  }
+
+  if (StopAt == (VectorizableTree.size() - 1))
+    return false;
+
+  // Cancel the unprofitable entries.
+  SmallPtrSet<Value *, 4> Removed;
+  int ReducedBy = MinCost - VectorizableTree.back().ScalarizeAtCost;
+  LLVM_DEBUG(dbgs() << "SLP: Reduced the tree cost by " << ReducedBy
+                    << " to make it partially vectorizable.\n");
+  for (unsigned I = StopAt + 1, E = VectorizableTree.size(); I < E; I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    Entry->NeedToGather = true;
+    RemovedOperations.push_back(I);
+    for (Value *V : Entry->Scalars) {
+      LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                        << " out of proposed to vectorize.\n");
+      ScalarToTreeEntry.erase(V);
+      Removed.insert(V);
+      MustGather.insert(V);
+      ExternalUses.erase(
+          std::remove_if(ExternalUses.begin(), ExternalUses.end(),
+                         [&](ExternalUser &EU) { return EU.Scalar == V; }),
+          ExternalUses.end());
+    }
+  }
+
+  // Canceled operations may still use values that remain vectorized; such
+  // uses require an extract, so populate the ExternalUses list with the
+  // users among the canceled elements.
+  for (unsigned I = 0, E = StopAt; I <= E; I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        Instruction *UserInst = dyn_cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+        if (Removed.count(U) == 0)
+          continue;
+        // Ignore users in the user ignore list.
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation :" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+      }
+    }
+  }
+  return true;
+}
+
 bool BoUpSLP::isFullyVectorizableTinyTree() {
   LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                     << VectorizableTree.size() << " is fully vectorizable .\n");
 
@@ -2463,7 +2577,7 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() {
+int BoUpSLP::getSpillCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
@@ -2487,7 +2601,9 @@
 
     // Update LiveValues.
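+    // Only scalars actually selected for vectorization count as live vector
+    // values; with throttling this can be a subset of the whole tree.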
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+      if (isa<Instruction>(&*J) && ScalarsToVec.count(&*J) > 0)
         LiveValues.insert(cast<Instruction>(&*J));
     }
 
@@ -2528,13 +2642,60 @@
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
-  int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+int BoUpSLP::getExtractOperationCost(ExternalUser &EU) {
+  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
 
-  unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
+
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+
+  if (MinBWs.count(ScalarRoot)) {
+    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+    auto Extend =
+        MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+    VecTy = VectorType::get(MinTy, BundleWidth);
+    return TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), VecTy,
+                                         EU.Lane);
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
+
+int BoUpSLP::getExtractCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) {
+  int ExtractCost = 0;
+  SmallPtrSet<Value *, 4> ExtractCostCalculated;
+  for (ExternalUser &EU : ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    // Skip scalars that are not vectorized at this tree height.
+    if (ScalarsToVec.count(EU.Scalar) == 0) {
+      // Still consider extracting vectorized values that canceled
+      // elements use.
+      if (InternalTreeUses.find(EU.Scalar) != InternalTreeUses.end())
+        for (ExternalUser &IU : InternalTreeUses[EU.Scalar])
+          ExtractCost += getExtractOperationCost(IU);
+      continue;
+    }
+
+    ExtractCost += getExtractOperationCost(EU);
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getTreeCost(bool ReduceTree) {
+  SmallDenseMap<int, SmallVector<int, 4>> UseTreeIndices;
+  unsigned LastNonGather = 0;
 
   for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
     TreeEntry &TE = VectorizableTree[I];
@@ -2557,46 +2718,58 @@
                     }))
       continue;
 
-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    if (TE.NeedToGather) {
+      for (int UseTreeIdx : TE.UserTreeIndices)
+        UseTreeIndices[UseTreeIdx].push_back(I);
+    } else {
+      LastNonGather++;
+    }
+
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
   }
 
-  SmallPtrSet<Value *, 4> ExtractCostCalculated;
+  SmallPtrSet<Value *, 16> ScalarsToVec;
+  int CostSum = 0;
   int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
+  int SpillCost = 0;
+  unsigned NonGatherIdx = 0;
 
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
 
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type. Here, we account
-    // for the extract and the added cost of the sign extend if needed.
-    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
-    if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
-      auto Extend =
-          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
-      VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
-                                                   VecTy, EU.Lane);
-    } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+    int GatherCost = 0;
+    if (!Entry->NeedToGather) {
+      NonGatherIdx++;
+      for (Value *V : Entry->Scalars)
+        ScalarsToVec.insert(V);
+      auto It = UseTreeIndices.find(I);
+      if (It != UseTreeIndices.end())
+        for (int Gather : It->second)
+          GatherCost += VectorizableTree[Gather].Cost;
+
+      CostSum += Entry->Cost + GatherCost;
+    }
+
+    if (ReduceTree || I == (E - 1)) {
+      Entry->ScalarizeAtCost = CostSum;
+
+      // Avoid adding the gather cost for the highest vectorizable entry of
+      // the tree; for any other entry, entering the vectorization chain at
+      // a non-first position of the tree means this entry's scalars must be
+      // gathered, so account for that cost here.
+      if (ReduceTree && NonGatherIdx != LastNonGather)
+        Entry->ScalarizeAtCost += getGatherCost(Entry->Scalars);
+      ExtractCost = getExtractCost(ScalarsToVec);
+      Entry->ScalarizeAtCost += ExtractCost;
+      SpillCost = getSpillCost(ScalarsToVec);
+      Entry->ScalarizeAtCost += SpillCost;
     }
   }
 
-  int SpillCost = getSpillCost();
-  Cost += SpillCost + ExtractCost;
+  int Cost = VectorizableTree.back().ScalarizeAtCost;
 
   std::string Str;
   {
@@ -3580,7 +3753,12 @@
 BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
-    scheduleBlock(BSIter.second.get());
+    BlockScheduling *BS = BSIter.second.get();
+    // Remove the schedule data of the nodes for which we have changed the
+    // vectorization decision.
+    if (!RemovedOperations.empty())
+      removeFromScheduling(BS);
+    scheduleBlock(BS);
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
@@ -3709,13 +3887,16 @@
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
 #ifndef NDEBUG
-        for (User *U : Scalar->users()) {
-          LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
-          // It is legal to replace users in the ignorelist by undef.
-          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
-                 "Replacing out-of-tree value with undef");
-        }
+        // If the tree was reduced, scalars of canceled operations remain in
+        // use, so we cannot check every user here.
+        if (RemovedOperations.empty())
+          for (User *U : Scalar->users()) {
+            LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+            // It is legal to replace users in the ignorelist by undef.
+            assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+                   "Replacing out-of-tree value with undef");
+          }
 #endif
         Value *Undef = UndefValue::get(Ty);
         Scalar->replaceAllUsesWith(Undef);
@@ -4192,6 +4373,32 @@
 
   ReadyInsts.clear();
 }
 
+void BoUpSLP::removeFromScheduling(BlockScheduling *BS) {
+  bool Removed = false;
+  for (int I : RemovedOperations) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]);
+    if (SD && SD->isPartOfBundle()) {
+      if (!Removed) {
+        Removed = true;
+        BS->resetSchedule();
+      }
+      BS->cancelScheduling(Entry->Scalars, SD->OpValue);
+    }
+  }
+  if (!Removed)
+    return;
+  BS->resetSchedule();
+  BS->initialFillReadyList(BS->ReadyInsts);
+  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end())
+      continue;
+    BS->doForAllOpcodes(I,
+                        [&](ScheduleData *SD) { SD->clearDependencies(); });
+  }
+}
+
 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   if (!BS->ScheduleStart)
     return;
@@ -4688,7 +4895,19 @@
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
-  const unsigned Sz = R.getVectorElementSize(Chain[0]);
+  // Find the first store in the chain that still has a value operand; a
+  // store without one was already vectorized.
+  Value *FirstStore = nullptr;
+  for (Value *V : Chain) {
+    assert(isa<StoreInst>(V) && "Expected only StoreInst here!");
+    if (cast<StoreInst>(V)->getValueOperand()) {
+      FirstStore = V;
+      break;
+    }
+  }
+  if (!FirstStore)
+    return false;
+  const unsigned Sz = R.getVectorElementSize(FirstStore);
   const unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -4709,6 +4925,13 @@
                       << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
 
+    // Skip the slice if any of its stores was already vectorized (no value
+    // operand left).
+    if (std::any_of(Operands.begin(), Operands.end(), [](Value *V) {
+          return !cast<StoreInst>(V)->getValueOperand();
+        }))
+      continue;
+
     R.buildTree(Operands);
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
@@ -4735,6 +4958,13 @@
 
       // Move to the next bundle.
       i += VF - 1;
       Changed = true;
+    } else {
+      Cost = R.getTreeCost(true);
+      if (R.reduceTree()) {
+        R.vectorizeTree();
+        i += VF - 1;
+        Changed = true;
+      }
     }
   }
Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 {
+; CHECK-LABEL: @rftbsub(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 2, 1
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT:    [[ADD16:%.*]] = fadd double [[TMP3]], undef
+; CHECK-NEXT:    [[MUL18:%.*]] = fmul double undef, [[ADD16]]
+; CHECK-NEXT:    [[ADD19:%.*]] = fadd double undef, [[MUL18]]
+; CHECK-NEXT:    [[SUB22:%.*]] = fsub double undef, undef
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
+; CHECK-NEXT:    unreachable
+;
+entry:
+  %arrayidx6 = getelementptr inbounds double, double* %a, i64 2
+  %0 = load double, double* %arrayidx6, align 8
+  %1 = or i64 2, 1
+  %arrayidx12 = getelementptr inbounds double, double* %a, i64 %1
+  %2 = load double, double* %arrayidx12, align 8
+  %add16 = fadd double %2, undef
+  %mul18 = fmul double undef, %add16
+  %add19 = fadd double undef, %mul18
+  %sub22 = fsub double undef, undef
+  %sub25 = fsub double %0, %add19
+  store double %sub25, double* %arrayidx6, align 8
+  %sub29 = fsub double %2, %sub22
+  store double %sub29, double* %arrayidx12, align 8
+  unreachable
+}
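
A note for reviewers: the cut-point search in reduceTree() can be studied in
isolation. Below is a minimal standalone C++ sketch of that backward scan;
Node, findStopPoint, and CostThreshold are illustrative names only and are
not part of the patch.

#include <cstdio>
#include <vector>

// Hypothetical, simplified stand-in for one vectorizable-tree entry.
struct Node {
  bool NeedToGather;   // gather entries carry no vectorized operation
  int ScalarizeAtCost; // tree cost if every entry after this one is gathered
};

// Mirrors the scan in BoUpSLP::reduceTree(): walk backward from the full
// tree and return the last entry of the largest profitable prefix, or -1 if
// throttling does not help. A prefix holding a single seed operation is
// rejected, and a profitable last index means the full tree already pays
// off, so there is nothing to reduce.
int findStopPoint(const std::vector<Node> &Tree, int CostThreshold) {
  // Prefix counts of non-gather entries, like the patch's NonGather array.
  std::vector<int> NonGather;
  int Count = 0;
  for (const Node &N : Tree) {
    if (!N.NeedToGather)
      ++Count;
    NonGather.push_back(Count);
  }
  for (int I = (int)Tree.size() - 1; I >= 0; --I) {
    if (Tree[I].NeedToGather)
      continue;
    if (Tree[I].ScalarizeAtCost < -CostThreshold && NonGather[I] > 1)
      return I == (int)Tree.size() - 1 ? -1 : I;
  }
  return -1;
}

int main() {
  // The full tree costs +2 (unprofitable), but cutting right after entry 1
  // keeps a two-operation prefix whose cost is -3.
  std::vector<Node> Tree = {{false, -1}, {false, -3}, {true, -3}, {false, 2}};
  std::printf("stop at entry %d\n", findStopPoint(Tree, 0)); // prints 1
  return 0;
}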