Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -117,6 +117,15 @@ ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); +static cl::opt + SLPThrottling("slp-throttle", cl::init(true), cl::Hidden, + cl::desc("Enable tree partial vectorize with throttling")); + +static cl::opt + MaxCostsRecalculations("slp-throttling-budget", cl::init(32), cl::Hidden, + cl::desc("Limit the total number of nodes for cost " + "recalculations during throttling")); + static cl::opt ShouldStartVectorizeHorAtStore( "slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc( @@ -570,6 +579,8 @@ MinVecRegSize = MinVectorRegSizeOption; else MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); + BuiltTrees.push_back(std::make_unique()); + Tree = BuiltTrees.back().get(); } /// Vectorize the tree that starts with the elements in \p VL. @@ -583,7 +594,51 @@ /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost() const; + int getSpillCost(); + + /// \returns the cost extracting vectorized elements. + int getExtractCost() const; + + /// \returns the cost of gathering canceled elements to be used + /// by vectorized operations during throttling. + int getInsertCost(); + + /// Find a subtree of the whole tree suitable to be vectorized. When + /// vectorizing the whole tree is not profitable, we can consider vectorizing + /// part of that tree. SLP algorithm looks to operations to vectorize starting + /// from seed instructions on the bottom toward the end of chains of + /// dependencies to the top of SLP graph, it groups potentially vectorizable + /// operations in scalar form to bundles. + /// For example: + /// + /// scalar form + /// | + /// scalar form scalar form + /// \ / + /// scalar form + /// + /// Total cost is not profitable to vectorize, hence all operations are in + /// scalar form. + /// + /// Here is the same tree after SLP throttling transformation: + /// + /// vector form + /// | + /// vector form scalar form + /// \ / + /// vector form + /// + /// So, we can throttle some operations in such a way that it is still + /// profitable to vectorize part on the tree, while all tree vectorization + /// does not make sense. + /// More details: https://www.cl.cam.ac.uk/~tmj32/papers/docs/porpodas15-pact.pdf + bool findSubTree(int UserCost = 0); + + /// Explore SLP graph non-gathering nodes structure. + void treeTraversal(); + + /// Get raw summary of all elements of the tree. + int getRawTreeCost(); /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. @@ -602,44 +657,35 @@ ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst = None); - /// Clear the internal data structures that are created by 'buildTree'. - void deleteTree() { - VectorizableTree.clear(); - ScalarToTreeEntry.clear(); - MustGather.clear(); - ExternalUses.clear(); - NumOpsWantToKeepOrder.clear(); - NumOpsWantToKeepOriginalOrder = 0; - for (auto &Iter : BlocksSchedules) { - BlockScheduling *BS = Iter.second.get(); - BS->clear(); - } - MinBWs.clear(); + /// Save current tree for possible later vectorization. + void saveTree() { + BuiltTrees.push_back(std::make_unique()); + Tree = BuiltTrees.back().get(); } - unsigned getTreeSize() const { return VectorizableTree.size(); } + unsigned getTreeSize() const { return Tree->VectorizableTree.size(); } /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); /// \returns The best order of instructions for vectorization. Optional> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && + assert(llvm::all_of(Tree->NumOpsWantToKeepOrder, + [this](const decltype( + Tree->NumOpsWantToKeepOrder)::value_type &D) { + return D.getFirst().size() == + Tree->VectorizableTree[0]->Scalars.size(); + }) && "All orders must have the same size as number of instructions in " "tree node."); auto I = std::max_element( - NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), - [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, - const decltype(NumOpsWantToKeepOrder)::value_type &D2) { + Tree->NumOpsWantToKeepOrder.begin(), Tree->NumOpsWantToKeepOrder.end(), + [](const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D1, + const decltype(Tree->NumOpsWantToKeepOrder)::value_type &D2) { return D1.second < D2.second; }); - if (I == NumOpsWantToKeepOrder.end() || - I->getSecond() <= NumOpsWantToKeepOriginalOrder) + if (I == Tree->NumOpsWantToKeepOrder.end() || + I->getSecond() <= Tree->NumOpsWantToKeepOriginalOrder) return None; return makeArrayRef(I->getFirst()); @@ -661,7 +707,7 @@ void findRootOrder(OrdersType &Order) { // If the leaf has the same number of instructions to vectorize as the root // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); + unsigned RootSize = Tree->VectorizableTree[0]->Scalars.size(); if (Order.size() == RootSize) return; SmallVector RealOrder(Order.size()); @@ -672,7 +718,7 @@ // The leaf has less number of instructions - need to find the true order of // the root. // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); + const TreeEntry *PNode = Tree->VectorizableTree.back().get(); SmallVector Nodes(1, PNode); SmallPtrSet Visited; while (!Nodes.empty() && Order.size() != RootSize) { @@ -771,6 +817,12 @@ /// may not be necessary. bool isLoadCombineCandidate() const; + /// Cut the tree to make it partially vectorizable. + void cutTree(); + + /// Try partially vectorize the tree via throttling. + bool tryPartialVectorization(); + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1554,8 +1606,14 @@ /// Do we need to gather this sequence or vectorize it /// (either with vector instruction or with scatter/gather - /// intrinsics for store/load)? - enum EntryState { Vectorize, ScatterVectorize, NeedToGather }; + /// intrinsics for store/load)? Or mark the sequence as once + /// proposed to vectorize. + enum EntryState { + Vectorize, + ScatterVectorize, + NeedToGather, + ProposedToGather + }; EntryState State; /// Does this sequence require some shuffling? @@ -1564,6 +1622,12 @@ /// Does this entry require reordering? SmallVector ReorderIndices; + /// Cost of this tree entry. + int Cost = 0; + + /// Distance from this entry to the root of the tree. + unsigned Distance = 0; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -1576,6 +1640,9 @@ /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + /// Use of this entry. + TinyPtrVector UseEntries; + /// The index of this treeEntry in VectorizableTree. int Idx = -1; @@ -1709,6 +1776,9 @@ case NeedToGather: dbgs() << "NeedToGather\n"; break; + case ProposedToGather: + dbgs() << "ProposedToGather\n"; + break; } dbgs() << "MainOp: "; if (MainOp) @@ -1779,9 +1849,10 @@ assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); - VectorizableTree.push_back(std::make_unique(VectorizableTree)); - TreeEntry *Last = VectorizableTree.back().get(); - Last->Idx = VectorizableTree.size() - 1; + Tree->VectorizableTree.push_back( + std::make_unique(Tree->VectorizableTree)); + TreeEntry *Last = Tree->VectorizableTree.back().get(); + Last->Idx = Tree->VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -1791,7 +1862,7 @@ if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); - ScalarToTreeEntry[V] = Last; + Tree->ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. unsigned Lane = 0; @@ -1804,52 +1875,46 @@ assert((!Bundle.getValue() || Lane == VL.size()) && "Bundle and VL out of sync"); } else { - MustGather.insert(VL.begin(), VL.end()); + Tree->MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) + if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); + Tree->VectorizableTree[UserTreeIdx.UserTE->Idx]->UseEntries.push_back( + Last); + } return Last; } - /// -- Vectorization State -- - /// Holds all of the tree entries. - TreeEntry::VecTreeTy VectorizableTree; - #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dumpVectorizableTree() const { - for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { - VectorizableTree[Id]->dump(); + for (unsigned Id = 0, IdE = Tree->VectorizableTree.size(); Id != IdE; + ++Id) { + Tree->VectorizableTree[Id]->dump(); dbgs() << "\n"; } } #endif TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) + auto I = Tree->ScalarToTreeEntry.find(V); + if (I != Tree->ScalarToTreeEntry.end()) return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) + auto I = Tree->ScalarToTreeEntry.find(V); + if (I != Tree->ScalarToTreeEntry.end()) return I->second; return nullptr; } - /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; - /// Maps a value to the proposed vectorizable size. SmallDenseMap InstrElementSize; - /// A list of scalars that we found that we need to keep as scalars. - ValueSet MustGather; - /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -1866,6 +1931,9 @@ }; using UserList = SmallVector; + /// \returns the cost of extracting the vectorized elements. + int getExtractOperationCost(const ExternalUser &EU) const; + /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it @@ -1910,12 +1978,6 @@ /// eventually when the BoUpSLP is destructed. DenseMap DeletedInstructions; - /// A list of values that need to extracted out of the tree. - /// This list holds pairs of (Internal Scalar : External User). External User - /// can be nullptr, it means that this Internal Scalar will be used later, - /// after vectorization. - UserList ExternalUses; - /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; @@ -2313,8 +2375,8 @@ int SchedulingRegionID = 1; }; - /// Attaches the BlockScheduling structures to basic blocks. - MapVector> BlocksSchedules; + /// Remove operations from the list of proposed to schedule. + void removeFromScheduling(BlockScheduling *BS); /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. @@ -2347,13 +2409,98 @@ } }; - /// Contains orders of operations along with the number of bundles that have - /// operations in this order. It stores only those orders that require - /// reordering, if reordering is not required it is counted using \a - /// NumOpsWantToKeepOriginalOrder. - DenseMap NumOpsWantToKeepOrder; - /// Number of bundles that do not require reordering. - unsigned NumOpsWantToKeepOriginalOrder = 0; + /// Tree state that created by 'buildTree'. + struct TreeState { + using TreeStateTy = SmallVector, 2>; + + /// -- Vectorization State -- + /// Holds all of the tree entries. + TreeEntry::VecTreeTy VectorizableTree; + + /// Maps a specific scalar to its tree entry. + SmallDenseMap ScalarToTreeEntry; + + /// Tree entries that should not be vectorized due to throttling. + SmallVector RemovedOperations; + + /// Contains orders of operations along with the number of bundles that have + /// operations in this order. It stores only those orders that require + /// reordering, if reordering is not required it is counted using \a + /// NumOpsWantToKeepOriginalOrder. + DenseMap + NumOpsWantToKeepOrder; + /// Number of bundles that do not require reordering. + unsigned NumOpsWantToKeepOriginalOrder = 0; + + /// A list of scalars that we found that we need to keep as scalars. + ValueSet MustGather; + + /// Raw cost of all elemts in the tree. + int RawTreeCost = 0; + + /// Final cost of the tree. + int TreeCost = 0; + + /// Partail cost of the tree. + int PartialCost = 0; + + /// Indicate that no CallInst found in the tree and we don't need to + /// calculate spill cost. + bool NoCallInst = true; + + /// True, if we have calucalte tree cost for the tree. + bool IsCostSumReady = false; + + /// A list of values that need to extracted out of the tree. + /// This list holds pairs of (Internal Scalar : External User). External + /// User can be nullptr, it means that this Internal Scalar will be used + /// later, after vectorization. + UserList ExternalUses; + + /// Current operations width to vectorize. + unsigned BundleWidth = 0; + + /// Internal tree oprations proposed to be vectorized values use. + SmallDenseMap InternalTreeUses; + + /// Attaches the BlockScheduling structures to basic blocks. + MapVector> BlocksSchedules; + + /// A map of scalar integer values to the smallest bit width with which they + /// can legally be represented. The values map to (width, signed) pairs, + /// where "width" indicates the minimum bit width and "signed" is True if + /// the value must be signed-extended, rather than zero-extended, back to + /// its original width. + MapVector> MinBWs; + + /// Clear the internal data structures that are created by 'buildTree'. + void deleteTree() { + VectorizableTree.clear(); + ScalarToTreeEntry.clear(); + MustGather.clear(); + ExternalUses.clear(); + InternalTreeUses.clear(); + RemovedOperations.clear(); + NumOpsWantToKeepOrder.clear(); + NumOpsWantToKeepOriginalOrder = 0; + for (auto &Iter : BlocksSchedules) { + BlockScheduling *BS = Iter.second.get(); + BS->clear(); + } + MinBWs.clear(); + NoCallInst = true; + RawTreeCost = 0; + TreeCost = 0; + PartialCost = 0; + IsCostSumReady = false; + } + }; + + // Previous trees that might be worth to vectorize. + TreeState::TreeStateTy BuiltTrees; + + // Current tree that we consider. + TreeState *Tree = nullptr; // Analysis and block reference. Function *F; @@ -2407,7 +2554,7 @@ }; static NodeRef getEntryNode(BoUpSLP &R) { - return R.VectorizableTree[0].get(); + return R.Tree->VectorizableTree[0].get(); } static ChildIteratorType child_begin(NodeRef N) { @@ -2435,14 +2582,14 @@ }; static nodes_iterator nodes_begin(BoUpSLP *R) { - return nodes_iterator(R->VectorizableTree.begin()); + return nodes_iterator(R->Tree->VectorizableTree.begin()); } static nodes_iterator nodes_end(BoUpSLP *R) { - return nodes_iterator(R->VectorizableTree.end()); + return nodes_iterator(R->Tree->VectorizableTree.end()); } - static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } + static unsigned size(BoUpSLP *R) { return R->Tree->VectorizableTree.size(); } }; template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { @@ -2460,7 +2607,7 @@ for (auto V : Entry->Scalars) { OS << *V; if (std::any_of( - R->ExternalUses.begin(), R->ExternalUses.end(), + R->Tree->ExternalUses.begin(), R->Tree->ExternalUses.end(), [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) OS << " "; OS << "\n"; @@ -2512,14 +2659,14 @@ void BoUpSLP::buildTree(ArrayRef Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst) { - deleteTree(); + Tree->deleteTree(); UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -2541,7 +2688,7 @@ if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " << Lane << " from " << *Scalar << ".\n"); - ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + Tree->ExternalUses.emplace_back(Scalar, nullptr, FoundLane); } for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); @@ -2556,11 +2703,14 @@ // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. + Tree->InternalTreeUses[U].emplace_back(Scalar, U, FoundLane); if (UseScalar != U || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); + assert((UseEntry->State == TreeEntry::Vectorize || + UseEntry->State == TreeEntry::ScatterVectorize) && + "Bad state"); continue; } } @@ -2571,7 +2721,7 @@ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); + Tree->ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); } } } @@ -2655,7 +2805,7 @@ // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (Tree->MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; @@ -2699,7 +2849,7 @@ VL = UniqueValues; } - auto &BSRef = BlocksSchedules[BB]; + auto &BSRef = Tree->BlocksSchedules[BB]; if (!BSRef) BSRef = std::make_unique(BB); @@ -2764,14 +2914,14 @@ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); - ++NumOpsWantToKeepOriginalOrder; + ++Tree->NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0); + Tree->VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2787,12 +2937,12 @@ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + ++Tree->NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0); + Tree->VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); @@ -2857,7 +3007,7 @@ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; + ++Tree->NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); @@ -2870,7 +3020,7 @@ TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + ++Tree->NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3126,7 +3276,7 @@ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; + ++Tree->NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); @@ -3140,7 +3290,7 @@ buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; + ++Tree->NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3279,6 +3429,64 @@ } } +void BoUpSLP::cutTree() { + SmallVector VecNodes; + + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) + continue; + // For all canceled operations we should consider the possibility of + // use by with non-canceled operations and for that, it requires + // to populate ExternalUser list with canceled elements. + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + TreeEntry *UserTE = getTreeEntry(U); + if (!UserTE || UserTE->State != TreeEntry::ProposedToGather) + continue; + // Ignore users in the user ignore list. + auto *UserInst = dyn_cast(U); + if (!UserInst) + continue; + + if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need extract to canceled operation :" << *U + << " from lane " << Lane << " from " << *Scalar + << ".\n"); + Tree->ExternalUses.emplace_back(Scalar, U, Lane); + } + } + } + // Canceling unprofitable elements. + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::ProposedToGather) + continue; + Entry->State = TreeEntry::NeedToGather; + for (Value *V : Entry->Scalars) { + Tree->ScalarToTreeEntry.erase(V); +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); +#endif + } + } +} + +bool BoUpSLP::tryPartialVectorization() { + if (BuiltTrees.size() == 1) + return false; + Tree = BuiltTrees.front().get(); + vectorizeTree(); + LLVM_DEBUG(dbgs() << "SLP: Decided to partially vectorize tree with cost: " + << Tree->PartialCost << ".\n"); + return true; +} + unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N = 1; Type *EltTy = T; @@ -3380,7 +3588,7 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return Tree->ScalarToTreeEntry.count(U) > 0; }); } @@ -3437,7 +3645,8 @@ ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->State == TreeEntry::NeedToGather) { + if (E->State != TreeEntry::Vectorize && + E->State != TreeEntry::ScatterVectorize) { if (allConstant(VL)) return 0; if (isSplat(VL)) { @@ -3455,7 +3664,7 @@ // instruction as dead and remove its cost from the final cost of the // vectorized tree. if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + !Tree->ScalarToTreeEntry.count(V)) { auto *IO = cast( cast(V)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -3833,25 +4042,26 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " - << VectorizableTree.size() << " is fully vectorizable .\n"); + << Tree->VectorizableTree.size() + << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && - VectorizableTree[0]->State == TreeEntry::Vectorize) + if (Tree->VectorizableTree.size() == 1 && + Tree->VectorizableTree[0]->State == TreeEntry::Vectorize) return true; - if (VectorizableTree.size() != 2) + if (Tree->VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (VectorizableTree[0]->State == TreeEntry::Vectorize && - (allConstant(VectorizableTree[1]->Scalars) || - isSplat(VectorizableTree[1]->Scalars))) + if (Tree->VectorizableTree[0]->State == TreeEntry::Vectorize && + (allConstant(Tree->VectorizableTree[1]->Scalars) || + isSplat(Tree->VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->State == TreeEntry::NeedToGather || - VectorizableTree[1]->State == TreeEntry::NeedToGather) + if (Tree->VectorizableTree[0]->State != TreeEntry::Vectorize || + Tree->VectorizableTree[1]->State != TreeEntry::Vectorize) return false; return true; @@ -3895,16 +4105,16 @@ if (RdxOpcode != Instruction::Or) return false; - unsigned NumElts = VectorizableTree[0]->Scalars.size(); - Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = Tree->VectorizableTree[0]->Scalars[0]; return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI); } bool BoUpSLP::isLoadCombineCandidate() const { // Peek through a final sequence of stores and check if all operations are // likely to be load-combined. - unsigned NumElts = VectorizableTree[0]->Scalars.size(); - for (Value *Scalar : VectorizableTree[0]->Scalars) { + unsigned NumElts = Tree->VectorizableTree[0]->Scalars.size(); + for (Value *Scalar : Tree->VectorizableTree[0]->Scalars) { Value *X; if (!match(Scalar, m_Store(m_Value(X), m_Value())) || !isLoadCombineCandidateImpl(X, NumElts, TTI)) @@ -3916,7 +4126,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. - if (VectorizableTree.size() >= MinTreeSize) + if (Tree->VectorizableTree.size() >= MinTreeSize) return false; // If we have a tiny tree (a tree whose size is less than MinTreeSize), we @@ -3924,8 +4134,8 @@ if (isFullyVectorizableTinyTree()) return false; - assert(VectorizableTree.empty() - ? ExternalUses.empty() + assert(Tree->VectorizableTree.empty() + ? Tree->ExternalUses.empty() : true && "We shouldn't have any external users"); // Otherwise, we can't vectorize the tree. It is both tiny and not fully @@ -3933,12 +4143,11 @@ return true; } -int BoUpSLP::getSpillCost() const { +int BoUpSLP::getSpillCost() { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). - unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); int Cost = 0; SmallPtrSet LiveValues; @@ -3951,7 +4160,7 @@ // their order does not matter, as long as all instructions in a basic block // are grouped together. Using dominance ensures a deterministic order. SmallVector OrderedScalars; - for (const auto &TEPtr : VectorizableTree) { + for (const std::unique_ptr &TEPtr : Tree->VectorizableTree) { Instruction *Inst = dyn_cast(TEPtr->Scalars[0]); if (!Inst) continue; @@ -3970,7 +4179,8 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && + Tree->ScalarToTreeEntry.find(&*J) != Tree->ScalarToTreeEntry.end()) LiveValues.insert(cast(&*J)); } @@ -4003,9 +4213,10 @@ } if (NumCalls) { + Tree->NoCallInst = false; SmallVector V; for (auto *II : LiveValues) - V.push_back(FixedVectorType::get(II->getType(), BundleWidth)); + V.push_back(FixedVectorType::get(II->getType(), Tree->BundleWidth)); Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); } @@ -4015,15 +4226,169 @@ return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; - LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " - << VectorizableTree.size() << ".\n"); +int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), Tree->BundleWidth); + Value *ScalarRoot = Tree->VectorizableTree.front()->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt; + VecTy = FixedVectorType::get(MinTy, Tree->BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +int BoUpSLP::getExtractCost() const { + int ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + // Consider the possibility of extracting vectorized + // values for canceled elements use. + for (const std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::ProposedToGather) + continue; + for (Value *V : Entry->Scalars) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. + auto It = Tree->InternalTreeUses.find(V); + if (It != Tree->InternalTreeUses.end()) { + const UserList &UL = It->second; + for (const ExternalUser &IU : UL) + ExtractCost += getExtractOperationCost(IU); + } + } + } + for (const ExternalUser &EU : Tree->ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + int Cost = getExtractOperationCost(EU); + ExtractCost += Cost; + } + return ExtractCost; +} + +int BoUpSLP::getInsertCost() { + int InsertCost = 0; + for (const std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + // Avoid already vectorized TreeEntries, it is already in a vector form and + // we don't need to gather those operations. + if (Entry->State != TreeEntry::ProposedToGather) + continue; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + if (llvm::any_of(Inst->users(), [this](User *Op) { + return Tree->ScalarToTreeEntry.count(Op) > 0; + })) { + InsertCost += getEntryCost(Entry); + break; + } + } + } + return InsertCost; +} + +void BoUpSLP::treeTraversal() { + SmallVector Worklist; + SmallSet Visited; + + Worklist.push_back(0); + // Does a BFS on non-gathering nodes of SLP graph and since the graph might + // have cycles we have to mark each visited node to process such tree. + Visited.insert(0); + do { + unsigned Current = Worklist.back(); + Worklist.pop_back(); + for (TreeEntry *TE : Tree->VectorizableTree[Current]->UseEntries) { + unsigned Next = TE->Idx; + if (Next == Current) + continue; + if (Tree->VectorizableTree[Next]->NeedToGather) + continue; + if (!Visited.count(Next)) { + Tree->VectorizableTree[Next]->Distance = + Tree->VectorizableTree[Current]->Distance + 1; + Worklist.push_back(Next); + Visited.insert(Next); + } + } + } while (!Worklist.empty()); +} + +bool BoUpSLP::findSubTree(int UserCost) { + treeTraversal(); + auto Cmp = [](const TreeEntry *LHS, const TreeEntry *RHS) { + return LHS->Distance > RHS->Distance; + }; + std::set Vec(Cmp); + for (const std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + // Ignore and non-vectoriable entries, entries with a low cost, + // or simple entries with just one operand or less. + if ((Entry->State != TreeEntry::Vectorize && + Entry->State != TreeEntry::ScatterVectorize) || + Entry->Cost <= 0 || + !Entry->Idx) + continue; + Vec.insert(Entry); + } + if (Vec.size() > MaxCostsRecalculations) { + std::set::iterator It = + Vec.begin(); + std::advance(It, (unsigned)MaxCostsRecalculations); + Vec.erase(It, Vec.end()); + } + + int Sum = 0; + for (TreeEntry *Entry : Vec) + Sum += Entry->Cost; + // Avoid reducing the tree if there is no potential room to reduce. + if ((Tree->TreeCost - UserCost - Sum) >= -SLPCostThreshold) + return false; + + for (TreeEntry *T : Vec) { + T->State = TreeEntry::ProposedToGather; + for (Value *V : T->Scalars) { + Tree->MustGather.insert(V); + Tree->ExternalUses.erase( + llvm::remove_if(Tree->ExternalUses, + [V](ExternalUser &EU) { return EU.Scalar == V; }), + Tree->ExternalUses.end()); + } + Tree->PartialCost = getTreeCost() - UserCost; + Tree->RemovedOperations.push_back(T); + if (Tree->PartialCost < -SLPCostThreshold) { + cutTree(); + return true; + } + } + return false; +} - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); +int BoUpSLP::getRawTreeCost() { + int CostSum = 0; + Tree->BundleWidth = Tree->VectorizableTree.front()->Scalars.size(); + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << Tree->VectorizableTree.size() << ".\n"); - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -4037,65 +4402,74 @@ // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.State == TreeEntry::NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - EntryPtr->isSame(TE.Scalars); - })) + if (TE.State != TreeEntry::Vectorize && + TE.State != TreeEntry::ScatterVectorize && + llvm::any_of(llvm::drop_begin(Tree->VectorizableTree, TE.Idx + 1), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->State != TreeEntry::Vectorize && + EntryPtr->State != TreeEntry::ScatterVectorize && + EntryPtr->isSame(TE.Scalars); + })) continue; - int C = getEntryCost(&TE); - Cost += C; - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] - << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + << ".\n"); + CostSum += TE.Cost; + LLVM_DEBUG(dbgs() << "SLP: Current total cost = " << CostSum << "\n"); } - SmallPtrSet ExtractCostCalculated; - int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; + if (SLPThrottling) + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State != TreeEntry::Vectorize && + TE->State != TreeEntry::ScatterVectorize) + continue; + int GatherCost = 0; + for (TreeEntry *Gather : TE->UseEntries) + if (Gather->State != TreeEntry::Vectorize && + Gather->State != TreeEntry::ScatterVectorize) + GatherCost += Gather->Cost; + TE->Cost += GatherCost; + } + return CostSum; +} - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; +int BoUpSLP::getTreeCost() { + int CostSum; + if (!Tree->IsCostSumReady) { + CostSum = getRawTreeCost(); + Tree->RawTreeCost = CostSum; + } else { + CostSum = Tree->RawTreeCost; + } - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = FixedVectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + if (SLPThrottling) + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State == TreeEntry::ProposedToGather) + CostSum -= TE->Cost; } - } - int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + int ExtractCost = getExtractCost(); + int SpillCost = 0; + if (!Tree->NoCallInst || !Tree->IsCostSumReady) + SpillCost = getSpillCost(); + assert((!Tree->NoCallInst || getSpillCost() == 0) && "Incorrect spill cost"); + if (!Tree->IsCostSumReady) + Tree->IsCostSumReady = true; + int InsertCost = getInsertCost(); + int Cost = CostSum + ExtractCost + SpillCost + InsertCost; + Tree->TreeCost = Cost; #ifndef NDEBUG SmallString<256> Str; - { - raw_svector_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; - } + raw_svector_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Insert Cost = " << InsertCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; LLVM_DEBUG(dbgs() << Str); if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); @@ -4173,9 +4547,9 @@ // scheduled, and the last instruction is VL.back(). So we start with // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. - if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + if (Tree->BlocksSchedules.count(BB)) { + auto *Bundle = Tree->BlocksSchedules[BB]->getScheduleData( + E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -4240,7 +4614,7 @@ FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), find(Entry->ReuseShuffleIndices, FoundLane)); } - ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); + Tree->ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); } } @@ -4613,7 +4987,7 @@ // to ExternalUses list to make sure that an extract will be generated // in the future. if (getTreeEntry(PO)) - ExternalUses.emplace_back(PO, cast(VecPtr), 0); + Tree->ExternalUses.emplace_back(PO, cast(VecPtr), 0); NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); } else { @@ -4657,7 +5031,7 @@ // ExternalUses to make sure that an extract will be generated in the // future. if (getTreeEntry(ScalarPtr)) - ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); + Tree->ExternalUses.emplace_back(ScalarPtr, cast(VecPtr), 0); Value *V = propagateMetadata(ST, E->Scalars); @@ -4757,7 +5131,7 @@ // call to ExternalUses list to make sure that an extract will be // generated in the future. if (ScalarArg && getTreeEntry(ScalarArg)) - ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); + Tree->ExternalUses.emplace_back(ScalarArg, cast(V), 0); propagateIRFlags(V, E->Scalars, VL0); ShuffleBuilder.addMask(E->ReuseShuffleIndices); @@ -4849,28 +5223,39 @@ Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. - for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + for (auto &BSIter : Tree->BlocksSchedules) { + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (!Tree->RemovedOperations.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); + auto *VectorRoot = vectorizeTree(Tree->VectorizableTree.front().get()); + + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State == TreeEntry::Vectorize && !Entry->VectorizedValue) + vectorizeTree(Entry); + } // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; + auto *ScalarRoot = Tree->VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - auto BundleWidth = VectorizableTree[0]->Scalars.size(); + Tree->BundleWidth = Tree->VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); + auto *VecTy = FixedVectorType::get(MinTy, Tree->BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0]->VectorizedValue = Trunc; + Tree->VectorizableTree[0]->VectorizedValue = Trunc; } - LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() + LLVM_DEBUG(dbgs() << "SLP: Extracting " << Tree->ExternalUses.size() << " values .\n"); // If necessary, sign-extend or zero-extend ScalarRoot to the larger type @@ -4884,7 +5269,7 @@ }; // Extract all of the elements with the external uses. - for (const auto &ExternalUse : ExternalUses) { + for (const auto &ExternalUse : Tree->ExternalUses) { Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; @@ -4964,7 +5349,7 @@ } // For each vectorized value: - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : Tree->VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. @@ -4979,7 +5364,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && Tree->RemovedOperations.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -4997,7 +5384,13 @@ Builder.ClearInsertionPoint(); InstrElementSize.clear(); - return VectorizableTree[0]->VectorizedValue; + // Erase all saved trees after vectorization, except the current. + llvm::erase_if( + BuiltTrees, + [this](std::unique_ptr &T) { return T.get() != Tree; }), + BuiltTrees.end(); + + return Tree->VectorizableTree[0]->VectorizedValue; } void BoUpSLP::optimizeGatherSequence() { @@ -5468,6 +5861,31 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (TreeEntry *Entry : Tree->RemovedOperations) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5677,11 +6095,11 @@ void BoUpSLP::computeMinimumValueSizes() { // If there are no external uses, the expression tree must be rooted by a // store. We can't demote in-memory values, so there is nothing to do here. - if (ExternalUses.empty()) + if (Tree->ExternalUses.empty()) return; // We only attempt to truncate integer expressions. - auto &TreeRoot = VectorizableTree[0]->Scalars; + auto &TreeRoot = Tree->VectorizableTree[0]->Scalars; auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); if (!TreeRootIT) return; @@ -5693,7 +6111,7 @@ // must have multiple uses and InstCombine will not rewrite it. The code // below ensures that only the roots are used externally. SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); - for (auto &EU : ExternalUses) + for (auto &EU : Tree->ExternalUses) if (!Expr.erase(EU.Scalar)) return; if (!Expr.empty()) @@ -5702,7 +6120,7 @@ // Collect the scalar values of the vectorizable expression. We will use this // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. - for (auto &EntryPtr : VectorizableTree) + for (auto &EntryPtr : Tree->VectorizableTree) Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); // Ensure the roots of the vectorizable tree don't form a cycle. They must @@ -5948,6 +6366,12 @@ << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } + + // Partially vectorize trees after all full vectorization is done, + // otherwise, we could prevent more profitable full vectorization with + // smaller vector sizes. + if (SLPThrottling) + Changed |= R.tryPartialVectorization(); } if (Changed) { @@ -6005,6 +6429,8 @@ R.vectorizeTree(); return true; } + if (SLPThrottling && R.findSubTree()) + R.saveTree(); return false; } @@ -6248,6 +6674,7 @@ R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); + unsigned UserCost = 0; CandidateFound = true; if (CompensateUseCost) { // TODO: Use TTI's getScalarizationOverhead for sequence of inserts @@ -6277,7 +6704,6 @@ // Switching to the TTI interface might help a bit. // Alternative solution could be pattern-match to detect a no-op or // shuffle. - unsigned UserCost = 0; for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { auto *IE = cast(InsertUses[I + Lane]); if (auto *CI = dyn_cast(IE->getOperand(2))) @@ -6304,6 +6730,8 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else if (SLPThrottling && R.findSubTree(UserCost)) { + R.saveTree(); } } } Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -204,11 +204,15 @@ ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: ; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer ; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 ; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> undef, i8 [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1 +; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[TMP3]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[P4]], i32 2 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[P6]], i32 3 +; MAX-COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -220,19 +224,21 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; MAX-COST-NEXT: [[TMP10:%.*]] = insertelement <4 x i1> undef, i1 [[TMP9]], i32 0 +; MAX-COST-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; MAX-COST-NEXT: [[TMP12:%.*]] = insertelement <4 x i1> [[TMP10]], i1 [[TMP11]], i32 1 +; MAX-COST-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; MAX-COST-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP13]], i32 2 +; MAX-COST-NEXT: [[TMP15:%.*]] = insertelement <4 x i1> [[TMP14]], i1 [[TMP8]], i32 3 +; MAX-COST-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) +; MAX-COST-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], [[P27]] +; MAX-COST-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP19]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -116,12 +116,15 @@ ; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 ; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 ; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 -; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 -; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T12]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[TMP5]], i32 2 +; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[TMP5]], i32 3 ; CHECK-NEXT: ret <4 x float> [[T15]] ; %t0 = bitcast <4 x float>* %x to i64* Index: llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -60,35 +60,34 @@ define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ ; AVX1-LABEL: @powof2div_nonuniform( ; AVX1-NEXT: entry: -; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4 -; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] -; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2 -; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4 -; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1 -; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1 -; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4 -; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]] -; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4 -; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1 ; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4 ; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 -; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4 -; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8 -; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4 ; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 -; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4 -; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]] -; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16 +; AVX1-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; AVX1-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 +; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 2 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 +; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[TMP6]], 4 +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1 +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 +; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[TMP7]], 8 +; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP8]], 16 ; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4 +; AVX1-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[DIV]], i32 0 +; AVX1-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[DIV6]], i32 1 +; AVX1-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[DIV11]], i32 2 +; AVX1-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[DIV16]], i32 3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[A]] to <4 x i32>* +; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @powof2div_nonuniform( Index: llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -175,43 +175,31 @@ ; ; AVX-LABEL: @gather_load_3( ; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 -; AVX-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 ; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 -; AVX-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 -; AVX-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2 -; AVX-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; AVX-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 -; AVX-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6 +; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7 +; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], +; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( @@ -356,19 +344,12 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 -; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 ; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 -; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 ; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 -; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 ; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18 -; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] @@ -378,22 +359,17 @@ ; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] ; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] ; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] -; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 -; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 -; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 -; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1 -; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 -; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[T3]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7 +; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>* +; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -1,24 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s +; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-throttle=false < %s | FileCheck %s --check-prefixes=NO_THROTTLE define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 { ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; +; NO_THROTTLE-LABEL: @rftbsub( +; NO_THROTTLE-NEXT: entry: +; NO_THROTTLE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 +; NO_THROTTLE-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 +; NO_THROTTLE-NEXT: [[TMP1:%.*]] = or i64 2, 1 +; NO_THROTTLE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] +; NO_THROTTLE-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 +; NO_THROTTLE-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; NO_THROTTLE-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] +; NO_THROTTLE-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] +; NO_THROTTLE-NEXT: [[SUB22:%.*]] = fsub double undef, undef +; NO_THROTTLE-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] +; NO_THROTTLE-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 +; NO_THROTTLE-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] +; NO_THROTTLE-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; NO_THROTTLE-NEXT: unreachable +; entry: %arrayidx6 = getelementptr inbounds double, double* %a, i64 2 %0 = load double, double* %arrayidx6, align 8