Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator.h"
@@ -117,8 +118,17 @@
                               "number "));

 static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
-                   cl::desc("Attempt to vectorize horizontal reductions"));
+    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+                       cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool>
+    SLPThrottling("slp-throttle", cl::init(true), cl::Hidden,
+                  cl::desc("Enable tree partial vectorize with throttling"));
+
+static cl::opt<int>
+    MaxCostsRecalculations("slp-throttling-budget", cl::init(128), cl::Hidden,
+                           cl::desc("Limit the total number of nodes for cost "
+                                    "recalculations during throttling"));

 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
@@ -556,9 +566,55 @@
   /// holding live values over call sites.
   int getSpillCost() const;

+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost() const;
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost() const;
+
+  /// Cut the given path until it might become profitable to vectorize.
+  bool cutPath(int &Cost, ArrayRef<TreeEntry *> Path);
+
+  /// Find a non-gathering leaf node from the current node \p C and record the
+  /// path on the way.
+  TreeEntry *findLeaf(TreeEntry *C, SmallVectorImpl<TreeEntry *> &Path) const;
+
+  /// Find a subtree of the whole tree suitable to be vectorized. When
+  /// vectorizing the whole tree is not profitable, we can consider vectorizing
+  /// part of that tree. The SLP algorithm looks for operations to vectorize
+  /// starting from seed instructions at the bottom and follows chains of
+  /// dependencies up to the top of the SLP graph, grouping potentially
+  /// vectorizable operations in scalar form into bundles.
+  /// For example:
+  ///
+  ///            scalar form
+  ///                 |
+  ///   scalar form      scalar form
+  ///            \          /
+  ///            scalar form
+  ///
+  /// The total cost is not profitable to vectorize, hence all operations stay
+  /// in scalar form.
+  ///
+  /// Here is the same tree after SLP throttling transformation:
+  ///
+  ///            vector form
+  ///                 |
+  ///   vector form      scalar form
+  ///            \          /
+  ///            vector form
+  ///
+  /// So, we can throttle some operations in such a way that it is still
+  /// profitable to vectorize part of the tree, even though vectorizing the
+  /// whole tree is not profitable.
+  /// More details: http://www.llvm.org/devmtg/2015-10/slides/Porpodas-ThrottlingAutomaticVectorization.pdf
+  Optional<int> findSubTree(int Cost);
+
   /// \returns the vectorization cost of the subtree that starts at \p VL.
-  /// A negative number means that this is profitable.
-  int getTreeCost();
+  /// A negative number means that this is profitable. The second member is the
+  /// raw cost sum of the subtree, without the extract cost, spill cost, etc.
+  std::pair<int, int> getTreeCost();

   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -579,6 +635,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -586,6 +644,9 @@
       BS->clear();
     }
     MinBWs.clear();
+    ScalarsToVec.clear();
+    VecToScalars.clear();
+    CostsRecalculations = 0;
   }

   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -641,6 +702,9 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;

+  /// Check that the subtree is functionally worth vectorizing, not just by cost.
+  bool isGoodSubTreeToVectorize() const;
+
   /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
   /// can be load combined in the backend. Load combining may not be allowed in
   /// the IR optimizer, so we do not want to alter the pattern. For example,
@@ -650,6 +714,9 @@
   /// may not be necessary.
   bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;

+  /// Try to cut the tree to make it partially vectorizable.
+  bool cutTree();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }

   /// This structure holds any data we need about the edges being traversed
@@ -1432,7 +1499,8 @@
     Value *VectorizedValue = nullptr;

     /// Do we need to gather this sequence ?
-    bool NeedToGather = false;
+    enum EntryState { Vectorize, NeedToGather, ProposedToGather };
+    EntryState State;

     /// Does this sequence require some shuffling?
     SmallVector ReuseShuffleIndices;
@@ -1440,6 +1508,9 @@
     /// Does this entry require reordering?
     ArrayRef ReorderIndices;

+    /// Cost of this tree entry.
+    int Cost = 0;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
@@ -1452,6 +1523,9 @@
     /// have multiple users so the data structure is not truly a tree.
     SmallVector UserTreeIndices;

+    /// The tree nodes that use this entry.
+    TinyPtrVector<TreeEntry *> UseTreeIndices;
+
     /// The index of this treeEntry in VectorizableTree.
     int Idx = -1;

@@ -1562,6 +1636,13 @@
       return true;
     }

+    // Returns true if this node has more than one vectorizable use.
+    bool isBranch() const {
+      return llvm::count_if(UseTreeIndices, [this](TreeEntry *Next) {
+               return (Next->Idx != Idx && Next->State == TreeEntry::Vectorize);
+             }) > 1;
+    }
+
 #ifndef NDEBUG
     /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
@@ -1574,7 +1655,18 @@
       dbgs() << "Scalars: \n";
       for (Value *V : Scalars)
         dbgs().indent(2) << *V << "\n";
-      dbgs() << "NeedToGather: " << NeedToGather << "\n";
+      dbgs() << "State: ";
+      switch (State) {
+      case Vectorize:
+        dbgs() << "Vectorize\n";
+        break;
+      case NeedToGather:
+        dbgs() << "NeedToGather\n";
+        break;
+      case ProposedToGather:
+        dbgs() << "ProposedToGather\n";
+        break;
+      }
       dbgs() << "MainOp: ";
       if (MainOp)
         dbgs() << *MainOp << "\n";
@@ -1594,12 +1686,12 @@
       if (ReuseShuffleIndices.empty())
         dbgs() << "Emtpy";
       else
-        for (unsigned ReuseIdx : ReuseShuffleIndices)
-          dbgs() << ReuseIdx << ", ";
+        for (unsigned Idx : ReuseShuffleIndices)
+          dbgs() << Idx << ", ";
       dbgs() << "\n";
       dbgs() << "ReorderIndices: ";
-      for (unsigned ReorderIdx : ReorderIndices)
-        dbgs() << ReorderIdx << ", ";
+      for (unsigned Idx : ReorderIndices)
+        dbgs() << Idx << ", ";
       dbgs() << "\n";
       dbgs() << "UserTreeIndices: ";
       for (const auto &EInfo : UserTreeIndices)
@@ -1620,7 +1712,7 @@
     TreeEntry *Last = VectorizableTree.back().get();
     Last->Idx = VectorizableTree.size() - 1;
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
-    Last->NeedToGather = !Vectorized;
+    Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
@@ -1644,8 +1736,10 @@
       MustGather.insert(VL.begin(), VL.end());
     }

-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
+      VectorizableTree[UserTreeIdx.UserTE->Idx]->UseTreeIndices.push_back(Last);
+    }

     return Last;
   }
@@ -1681,6 +1775,16 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;

+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector RemovedOperations;
+
+  /// Tree values proposed to be vectorized.
+  ValueSet ScalarsToVec;
+
+  /// Tree values once considered for vectorization, but later decided by
+  /// throttling to stay in scalar form.
+  ValueSet VecToScalars;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;

@@ -1700,6 +1804,9 @@
   };
   using UserList = SmallVector;

+  /// \returns the cost of extracting the vectorized element for use \p EU.
+  int getExtractOperationCost(const ExternalUser &EU) const;
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1750,6 +1857,13 @@
   /// after vectorization.
   UserList ExternalUses;

+  /// Number of nodes for which we have already recalculated the cost of
+  /// the subtree during throttling.
+  int CostsRecalculations = 0;
+
+  /// Internal (in-tree) uses of values proposed to be vectorized.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet EphValues;

@@ -2136,6 +2250,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

+  /// Remove canceled operations from the list proposed for scheduling.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -2291,7 +2408,7 @@

   static std::string getNodeAttributes(const TreeEntry *Entry,
                                        const BoUpSLP *) {
-    if (Entry->NeedToGather)
+    if (Entry->State == TreeEntry::NeedToGather)
       return "color=red";
     return "";
   }
@@ -2339,11 +2456,11 @@
   buildTree_rec(Roots, 0, EdgeInfo());

   // Collect the values that we need to extract from the tree.
-  for (auto &TEPtr : VectorizableTree) {
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
     TreeEntry *Entry = TEPtr.get();

     // No need to handle users of gathered values.
-    if (Entry->NeedToGather)
+    if (Entry->State == TreeEntry::NeedToGather)
       continue;

     // For each lane:
@@ -2380,7 +2497,8 @@
             !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
           LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                             << ".\n");
-          assert(!UseEntry->NeedToGather && "Bad state");
+          assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+          InternalTreeUses[U].emplace_back(Scalar, U, FoundLane);
           continue;
         }
       }
@@ -3088,6 +3206,64 @@
   }
 }

+bool BoUpSLP::cutTree() {
+  SmallPtrSet Removed;
+  SmallVector VecNodes;
+  if (!isGoodSubTreeToVectorize())
+    return false;
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::Vectorize)
+      VecNodes.push_back(Entry);
+  }
+  if (VecNodes.size() <= 2)
+    return false;
+  // Cancel unprofitable elements.
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    if (Entry->State == TreeEntry::NeedToGather)
+      continue;
+    if (Entry->State == TreeEntry::ProposedToGather) {
+      Entry->State = TreeEntry::NeedToGather;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+        ScalarToTreeEntry.erase(V);
+        Removed.insert(V);
+        RemovedOperations.push_back(Entry);
+        MustGather.insert(V);
+        ExternalUses.erase(
+            llvm::remove_if(ExternalUses,
+                            [&V](ExternalUser &EU) { return EU.Scalar == V; }),
+            ExternalUses.end());
+      }
+    }
+  }
+  // For all canceled operations we should consider the possibility of their
+  // use by non-canceled operations; for that, we need to populate the
+  // ExternalUses list with the canceled elements.
+  for (TreeEntry *Entry : VecNodes)
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+      for (User *U : Scalar->users()) {
+        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+        auto *UserInst = dyn_cast<Instruction>(U);
+        if (!UserInst)
+          continue;
+        if (!Removed.count(U))
+          continue;
+        // Ignore users in the user ignore list.
+        if (is_contained(UserIgnoreList, UserInst))
+          continue;
+        LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation:" << *U
+                          << " from lane " << Lane << " from " << *Scalar
+                          << ".\n");
+        ExternalUses.emplace_back(Scalar, U, Lane);
+      }
+    }
+  return true;
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N = 1;
   Type *EltTy = T;
@@ -3212,7 +3388,7 @@
     ReuseShuffleCost =
         TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
   }
-  if (E->NeedToGather) {
+  if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
       return 0;
     if (isSplat(VL)) {
@@ -3280,7 +3456,7 @@
           TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
       }
     }
-    if (!E->NeedToGather) {
+    if (E->State == TreeEntry::Vectorize) {
       int DeadCost = ReuseShuffleCost;
       if (!E->ReorderIndices.empty()) {
         // TODO: Merge this shuffle with the ReuseShuffleCost.
@@ -3566,20 +3742,22 @@ << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; @@ -3642,6 +3820,19 @@ return true; } +bool BoUpSLP::isGoodSubTreeToVectorize() const { + for (const std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->State != TreeEntry::Vectorize) + continue; + Instruction *Inst = Entry->getMainOp(); + if (Inst && (isa(Inst) || isa(Inst) || + isa(Inst))) + return true; + } + return false; +} + int BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, @@ -3653,7 +3844,7 @@ SmallPtrSet LiveValues; Instruction *PrevInst = nullptr; - for (const auto &TEPtr : VectorizableTree) { + for (const std::unique_ptr &TEPtr : VectorizableTree) { Instruction *Inst = dyn_cast(TEPtr->Scalars[0]); if (!Inst) continue; @@ -3666,7 +3857,7 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && ScalarsToVec.count(&*J)) LiveValues.insert(cast(&*J)); } @@ -3711,15 +3902,201 @@ return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; +int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); + + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + Value *ScalarRoot = VectorizableTree[0]->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? Instruction::SExt : Instruction::ZExt; + VecTy = VectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +int BoUpSLP::getExtractCost() const { + int ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + for (const ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // Avoid non-vectorized scalars. 
+    if (!ScalarsToVec.count(EU.Scalar)) {
+      // Consider the possibility of extracting vectorized
+      // values for use by the canceled elements.
+      auto It = InternalTreeUses.find(EU.Scalar);
+      if (It != InternalTreeUses.end()) {
+        const UserList &UL = It->second;
+        for (const ExternalUser &IU : UL)
+          ExtractCost += getExtractOperationCost(IU);
+      }
+      continue;
+    }
+    ExtractCost += getExtractOperationCost(EU);
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost() const {
+  int InsertCost = 0;
+  for (const std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry *Entry = TEPtr.get();
+    // Avoid already vectorized TreeEntries; they are already in vector form
+    // and we don't need to gather those operations.
+    if (Entry->State == TreeEntry::NeedToGather || Entry->VectorizedValue)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      for (Use &U : Inst->operands()) {
+        Value *Op = U.get();
+        if (VecToScalars.count(Op))
+          InsertCost += getGatherCost(Op);
+      }
+    }
+  }
+  return InsertCost;
+}
+
+bool BoUpSLP::cutPath(int &Cost, ArrayRef<TreeEntry *> Path) {
+  // Cancel nodes one by one until Path is exhausted or we find a suitable set
+  // of nodes for partial tree vectorization.
+  for (TreeEntry *N : Path) {
+    CostsRecalculations++;
+
+    // Stop if we are over our budget of cost recalculations.
+    if (CostsRecalculations >= MaxCostsRecalculations)
+      break;
+
+    // We no longer propose to vectorize this node, and we subtract its cost
+    // from the cost of all vectorizable nodes.
+    assert(N->State == TreeEntry::Vectorize &&
+           "Incorrect node state, visiting twice.");
+    N->State = TreeEntry::ProposedToGather;
+    Cost -= N->Cost;
+    for (Value *V : N->Scalars) {
+      ScalarsToVec.erase(V);
+      VecToScalars.insert(V);
+    }
+    int PartialCost = Cost;
+    PartialCost += getExtractCost() + getSpillCost() + getInsertCost();
+    if (PartialCost < -SLPCostThreshold && cutTree())
+      return true;
+  }
+  return false;
+}
+
+BoUpSLP::TreeEntry *
+BoUpSLP::findLeaf(TreeEntry *C, SmallVectorImpl<TreeEntry *> &Path) const {
+  int NonGatherUse;
+  if (!is_contained(Path, C))
+    Path.push_back(C);
+  do {
+    NonGatherUse = 0;
+    for (TreeEntry *Next : llvm::reverse(C->UseTreeIndices)) {
+      if (Next->State == TreeEntry::NeedToGather)
+        continue;
+      // Ignore any processed nodes to avoid cycles.
+      if (Next->State == TreeEntry::ProposedToGather ||
+          is_contained(Path, Next) || Next == C)
+        continue;
+      C = Next;
+      Path.push_back(C);
+      NonGatherUse++;
+      break;
+    }
+  } while (NonGatherUse != 0);
+  return C;
+}
+
+Optional<int> BoUpSLP::findSubTree(int Cost) {
+  SmallVector Path;
+  SmallVector SubPath;
+  TreeEntry *Node = nullptr;
+
+  // To start, we find just one leaf node that happens not to be the root node
+  // of the graph, i.e. one with a non-zero index. Then Path is the route from
+  // the root node to our leaf node.
+  if (!findLeaf(VectorizableTree[0].get(), Path)->Idx)
+    return None;
+  do {
+    Node = Path.back();
+    assert(Node->State == TreeEntry::Vectorize && "Incorrect node state");
+    // If we found a branch node, i.e. a node with more than one non-gathering
+    // child, we could try to find a set of profitable nodes in SubPath to
+    // vectorize, and if there is no such set of profitable nodes then we could
+    // consider another leaf that is reachable from this branch node.
+    if (Node->isBranch()) {
+      if (cutPath(Cost, SubPath))
+        return Cost;
+      if (CostsRecalculations >= MaxCostsRecalculations) {
+        SubPath.clear();
+        break;
+      }
+      TreeEntry *NextFromBranch = nullptr;
+      auto It = llvm::find_if(
+          llvm::reverse(Node->UseTreeIndices), [&Node, &Path](TreeEntry *E) {
+            return (E != Node && E->State == TreeEntry::Vectorize &&
+                    !is_contained(Path, E));
+          });
+      if (It != Node->UseTreeIndices.rend())
+        NextFromBranch = *It;
+      SubPath.clear();
+      if (NextFromBranch && NextFromBranch != Node)
+        Node = findLeaf(NextFromBranch, Path);
+    } else {
+      // If this node is not a branch node then we can move to another node
+      // below until we reach the root node of the graph or encounter another
+      // branch node.
+      SubPath.push_back(Node);
+      Path.pop_back();
+    }
+  } while (Node->Idx);
+
+  // There are no branches left now, so reduce the single remaining path.
+  if (!SubPath.empty()) {
+    if (cutPath(Cost, SubPath))
+      return Cost;
+  }
+
+#ifndef NDEBUG
+  // Make sure that we have processed all nodes.
+  if (CostsRecalculations < MaxCostsRecalculations)
+    for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+      TreeEntry *Entry = TEPtr.get();
+      if (Entry->State == TreeEntry::NeedToGather)
+        continue;
+      assert(Entry->State == TreeEntry::ProposedToGather &&
+             "Incorrect node state");
+    }
+#endif
+  return None;
+}
+
+std::pair<int, int> BoUpSLP::getTreeCost() {
+  int CostSum = 0;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");

-  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I].get();
+  for (std::unique_ptr<TreeEntry> &TEPtr : VectorizableTree) {
+    TreeEntry &TE = *TEPtr.get();

     // We create duplicate tree entries for gather sequences that have multiple
     // uses. However, we should not compute the cost of duplicate sequences.
@@ -3733,68 +4110,50 @@
     // their uses. Since such an approach results in fewer total entries,
     // existing heuristics based on tree size may yield different results.
     //
-    if (TE.NeedToGather &&
-        std::any_of(
-            std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(),
-            [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
-              return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars);
-            }))
+    if (TE.State == TreeEntry::NeedToGather &&
+        llvm::any_of(llvm::drop_begin(VectorizableTree, TE.Idx + 1),
+                     [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+                       return EntryPtr->State == TreeEntry::NeedToGather &&
+                              EntryPtr->isSame(TE.Scalars);
+                     }))
       continue;

-    int C = getEntryCost(&TE);
-    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+    if (TE.State == TreeEntry::Vectorize)
+      ScalarsToVec.insert(TE.Scalars.begin(), TE.Scalars.end());
+
+    TE.Cost = getEntryCost(&TE);
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost
                       << " for bundle that starts with " << *TE.Scalars[0]
                       << ".\n");
-    Cost += C;
+    CostSum += TE.Cost;
   }

-  SmallPtrSet ExtractCostCalculated;
-  int ExtractCost = 0;
-  for (ExternalUser &EU : ExternalUses) {
-    // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(EU.Scalar).second)
-      continue;
-
-    // Uses by ephemeral values are free (because the ephemeral value will be
-    // removed prior to code generation, and so the extraction will be
-    // removed as well).
-    if (EphValues.count(EU.User))
-      continue;
-
-    // If we plan to rewrite the tree in a smaller type, we will need to sign
-    // extend the extracted value back to the original type.
Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); + if (SLPThrottling) + for (std::unique_ptr &TEPtr : VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->State == TreeEntry::NeedToGather) + continue; + int GatherCost = 0; + for (TreeEntry *Gather : TE->UseTreeIndices) + if (Gather->State == TreeEntry::NeedToGather) + GatherCost += Gather->Cost; + TE->Cost = TE->Cost + GatherCost; } - } + int ExtractCost = getExtractCost(); int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + int Cost = CostSum + ExtractCost + SpillCost; - std::string Str; - { - raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; - } + SmallString<256> Str; + raw_svector_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); - return Cost; + return std::make_pair(Cost, CostSum); } int BoUpSLP::getGatherCost(Type *Ty, @@ -4029,7 +4388,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -4084,7 +4443,7 @@ } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -4117,7 +4476,7 @@ return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -4559,7 +4918,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. 
+ if (!RemovedOperations.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -4603,7 +4967,7 @@ continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4672,11 +5036,11 @@ } // For each vectorized value: - for (auto &TEPtr : VectorizableTree) { + for (std::unique_ptr &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -4687,7 +5051,9 @@ #ifndef NDEBUG Type *Ty = Scalar->getType(); - if (!Ty->isVoidTy()) { + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!Ty->isVoidTy() && RemovedOperations.empty()) { for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); @@ -5172,6 +5538,31 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (TreeEntry *Entry : RemovedOperations) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5679,7 +6070,9 @@ R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + std::pair C = R.getTreeCost(); + int Cost = C.first; + int CostSum = C.second; LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -5695,6 +6088,15 @@ R.vectorizeTree(); return true; + } else { + if (SLPThrottling) { + Optional PartialCost = R.findSubTree(CostSum); + if (PartialCost.hasValue() && + PartialCost.getValue() < -SLPCostThreshold) { + R.vectorizeTree(); + return true; + } + } } return false; @@ -5928,23 +6330,34 @@ continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost() - UserCost; + std::pair C = R.getTreeCost(); + int Cost = C.first - UserCost; + int CostSum = C.second; CandidateFound = true; MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", - cast(Ops[0])) - << "SLP vectorized with cost " << ore::NV("Cost", Cost) - << " and with tree size " - << ore::NV("TreeSize", R.getTreeSize())); + cast(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); // Move to the next bundle. 
I += VF - 1; NextInst = I + 1; Changed = true; + } else { + if (SLPThrottling) { + Optional PartialCost = R.findSubTree(CostSum); + if (PartialCost.hasValue() && + PartialCost.getValue() < -SLPCostThreshold) { + R.vectorizeTree(); + Changed = true; + } + } } } } @@ -6726,7 +7139,7 @@ V.computeMinimumValueSizes(); // Estimate cost. - int TreeCost = V.getTreeCost(); + int TreeCost = V.getTreeCost().first; int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); int Cost = TreeCost + ReductionCost; if (Cost >= -SLPCostThreshold) { Index: llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -34,21 +34,21 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 -; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 -; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 -; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 -; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 -; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8 -; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] -; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]] +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR31847.ll @@ -24,53 +24,59 @@ ; CHECK-NEXT: [[Y_045:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32 -; 
CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[CONV]], -128 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 ; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP5]] to i32 -; CHECK-NEXT: [[SUB4:%.*]] = add nsw i32 [[CONV3]], -128 -; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[SUB]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[CONV3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[CONV]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[TMP7]], ; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i32 128, [[CONV]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i32 [[SUB]], i32 [[SUB7]] -; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[SUB4]], -1 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[TMP8]], ; CHECK-NEXT: [[SUB12:%.*]] = sub nsw i32 128, [[CONV3]] -; CHECK-NEXT: [[COND14:%.*]] = select i1 [[CMP8]], i32 [[SUB4]], i32 [[SUB12]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[COND14]], [[COND]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[SUB12]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[SUB7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP9]], <2 x i32> [[TMP8]], <2 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i32 0, [[ADD]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[IDX_NEG]] -; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ADD_PTR]], align 1 -; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = load i8, i8* [[ADD_PTR]], align 1 +; CHECK-NEXT: [[CONV15:%.*]] = zext i8 [[TMP15]] to i32 ; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[CONV15]], [[INTENSITY:%.*]] ; CHECK-NEXT: [[CONV17:%.*]] = trunc i32 [[ADD16]] to i8 ; CHECK-NEXT: store i8 [[CONV17]], i8* [[ADD_PTR]], align 1 ; CHECK-NEXT: [[ADD_PTR18:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[ADD]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP7]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[ADD_PTR18]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL:%.*]] = icmp eq i8 [[TMP16]], 0 ; CHECK-NEXT: [[CONV21:%.*]] = zext i1 [[NOT_TOBOOL]] to i8 ; CHECK-NEXT: store i8 [[CONV21]], i8* [[ADD_PTR18]], align 1 ; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i8, i8* [[D1_DATA_046]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[SUB_1:%.*]] = add nsw i32 [[CONV_1]], -128 -; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[SUB4_1:%.*]] = add nsw i32 [[CONV3_1]], -128 -; CHECK-NEXT: [[CMP5_1:%.*]] = icmp sgt i32 [[SUB_1]], -1 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP17]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3_1:%.*]] = zext i8 [[TMP18]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[CONV3_1]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[CONV_1]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = add nsw <2 x i32> [[TMP20]], ; CHECK-NEXT: [[SUB7_1:%.*]] = sub nsw i32 128, [[CONV_1]] -; CHECK-NEXT: 
[[COND_1:%.*]] = select i1 [[CMP5_1]], i32 [[SUB_1]], i32 [[SUB7_1]] -; CHECK-NEXT: [[CMP8_1:%.*]] = icmp sgt i32 [[SUB4_1]], -1 +; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[TMP21]], ; CHECK-NEXT: [[SUB12_1:%.*]] = sub nsw i32 128, [[CONV3_1]] -; CHECK-NEXT: [[COND14_1:%.*]] = select i1 [[CMP8_1]], i32 [[SUB4_1]], i32 [[SUB12_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[COND14_1]], [[COND_1]] +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> undef, i32 [[SUB12_1]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[SUB7_1]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP22]], <2 x i32> [[TMP21]], <2 x i32> [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP25]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP26]], [[TMP27]] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i32 0, [[ADD_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[CONV15_1:%.*]] = zext i8 [[TMP28]] to i32 ; CHECK-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[CONV15_1]], [[INTENSITY]] ; CHECK-NEXT: [[CONV17_1:%.*]] = trunc i32 [[ADD16_1]] to i8 ; CHECK-NEXT: store i8 [[CONV17_1]], i8* [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[ADD_PTR18_1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[ADD_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 -; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ADD_PTR18_1]], align 1 +; CHECK-NEXT: [[NOT_TOBOOL_1:%.*]] = icmp eq i8 [[TMP29]], 0 ; CHECK-NEXT: [[CONV21_1:%.*]] = zext i1 [[NOT_TOBOOL_1]] to i8 ; CHECK-NEXT: store i8 [[CONV21_1]], i8* [[ADD_PTR18_1]], align 1 ; CHECK-NEXT: [[ADD_PTR23_1]] = getelementptr inbounds i8, i8* [[ADD_PTR23]], i32 [[TMP1]] Index: llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -12,39 +12,45 @@ ; SSE: for.body: ; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S1_055:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I40:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S0_054:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I44:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; SSE-NEXT: [[ADD:%.*]] = fadd float [[S0_054]], [[TMP0]] -; SSE-NEXT: [[ADD3:%.*]] = fadd float [[S1_055]], [[TMP0]] -; SSE-NEXT: [[MUL:%.*]] = fmul float [[S0_054]], 0.000000e+00 -; SSE-NEXT: 
[[ADD4:%.*]] = fadd float [[MUL]], [[ADD3]] -; SSE-NEXT: [[MUL5:%.*]] = fmul float [[S1_055]], 0.000000e+00 -; SSE-NEXT: [[ADD6:%.*]] = fadd float [[MUL5]], [[ADD]] -; SSE-NEXT: [[CMP_I:%.*]] = fcmp olt float [[ADD6]], 1.000000e+00 -; SSE-NEXT: [[COND_I:%.*]] = select i1 [[CMP_I]], float [[ADD6]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I51:%.*]] = fcmp olt float [[COND_I]], -1.000000e+00 -; SSE-NEXT: [[CMP_I49:%.*]] = fcmp olt float [[ADD4]], 1.000000e+00 -; SSE-NEXT: [[COND_I50:%.*]] = select i1 [[CMP_I49]], float [[ADD4]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I47:%.*]] = fcmp olt float [[COND_I50]], -1.000000e+00 -; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[COND_I]], 0.000000e+00 -; SSE-NEXT: [[MUL10:%.*]] = select i1 [[CMP_I51]], float -0.000000e+00, float [[COND_I_OP]] -; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[COND_I50]], 0.000000e+00 -; SSE-NEXT: [[MUL11:%.*]] = select i1 [[CMP_I47]], float -0.000000e+00, float [[COND_I50_OP]] -; SSE-NEXT: [[ADD13]] = fadd float [[MUL10]], [[MUL11]] +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] +; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[TMP7]], i32 0 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP6]], i32 1 +; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP5]], [[TMP9]] +; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], +; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> +; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer +; SSE-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> undef, float [[TMP16]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP15]], i32 1 +; SSE-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP18]] +; SSE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP19]], i32 0 +; SSE-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[TMP19]], i32 1 +; SSE-NEXT: [[ADD13]] = fadd float [[TMP20]], [[TMP21]] ; SSE-NEXT: [[CMP_I45:%.*]] = fcmp olt float [[ADD13]], 1.000000e+00 ; SSE-NEXT: [[COND_I46:%.*]] = select i1 [[CMP_I45]], float [[ADD13]], float 1.000000e+00 ; SSE-NEXT: [[CMP_I43:%.*]] = fcmp olt float [[COND_I46]], -1.000000e+00 -; SSE-NEXT: [[COND_I44]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] -; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[MUL11]], 1.000000e+00 -; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[MUL11]], float 1.000000e+00 +; SSE-NEXT: [[COND_I44:%.*]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] +; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[TMP21]], 1.000000e+00 +; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[TMP21]], float 1.000000e+00 ; SSE-NEXT: [[CMP_I39:%.*]] = fcmp olt float [[COND_I42]], -1.000000e+00 -; SSE-NEXT: [[COND_I40]] = select 
i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] +; SSE-NEXT: [[COND_I40:%.*]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] ; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <2 x float> undef, float [[COND_I40]], i32 0 +; SSE-NEXT: [[TMP23]] = insertelement <2 x float> [[TMP22]], float [[COND_I44]], i32 1 ; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; SSE: for.end: ; SSE-NEXT: ret void Index: llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -15,18 +15,20 @@ ; CHECK: for.body6: ; CHECK-NEXT: br label [[FOR_BODY12:%.*]] ; CHECK: for.body12: -; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ] -; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]] -; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]] -; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[FOR_BODY6]] ], [ [[TMP7:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double [[ADD15]], 4.000000e+00 ; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]] -; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] -; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[TMP4]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL18]], i32 1 +; CHECK-NEXT: [[TMP7]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] ; CHECK: for.inc21: ; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] Index: llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -67,13 +67,14 @@ ; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] 
= bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP16]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> undef, i64 [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP16]], <2 x i64>* [[TMP17]], align 1 ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry:
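
For readers skimming the patch: the updated cost-query call sites in SLPVectorizer.cpp above all follow the same caller-side pattern, condensed in the sketch below (the horizontal-reduction site only uses the first member of the pair). This is an illustrative summary, not literal patch text; all names are taken from the patch itself.

  // getTreeCost() now returns {total cost, raw per-node cost sum}.
  std::pair<int, int> C = R.getTreeCost();
  int Cost = C.first;     // includes spill and extract costs
  int CostSum = C.second; // raw sum, the starting point for throttling
  if (Cost < -SLPCostThreshold) {
    R.vectorizeTree();    // the whole tree is profitable
  } else if (SLPThrottling) {
    // Otherwise try to find a profitable subtree; findSubTree() returns the
    // partial cost if the throttled tree is still worth vectorizing.
    Optional<int> PartialCost = R.findSubTree(CostSum);
    if (PartialCost.hasValue() && PartialCost.getValue() < -SLPCostThreshold)
      R.vectorizeTree();
  }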