Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -114,6 +114,15 @@
 ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
+static cl::opt<bool>
+SLPThrottling("slp-throttling", cl::init(true), cl::Hidden,
+              cl::desc("Enable partial vectorization of the tree with throttling"));
+
+static cl::opt<int>
+MaxCostsRecalculations("slp-throttling-budget", cl::init(500), cl::Hidden,
+                       cl::desc("Limit the total number of nodes for cost "
+                                "recalculations during throttling"));
+
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
     cl::desc(
@@ -524,11 +533,35 @@
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
-  int getSpillCost() const;
+  int getSpillCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) const;
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractCost(const SmallPtrSetImpl<Value *> &ScalarsToVec);
+
+  /// \returns the cost of gathering canceled elements to be used
+  /// by vectorized operations during throttling.
+  int getInsertCost(const SmallPtrSetImpl<Value *> &VecToScalars);
+
+  /// Determine the cost of partial vectorization.
+  int getSubTreeCost(ArrayRef<unsigned> VecNodes);
+
+  /// Does a BFS over the non-gathering nodes of the SLP graph.
+  void BFS();
+
+  /// Find the subgraph of the graph that is profitable to vectorize.
+  int findSubTree();
+
+  /// Cut the branch until it is profitable to vectorize.
+  unsigned cutBranch(unsigned LeafIdx, unsigned L,
+                     ArrayRef<unsigned> VecNodes, int &Cost);
+
+  /// Try to extend the subgraph that makes sense to vectorize from
+  /// a cost perspective.
+  int extendSubTree(unsigned LeafIdx, ArrayRef<unsigned> VecNodes);
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  int getTreeCost();
+  int getTreeCost(bool cutTree = false);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -549,6 +582,8 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
+    InternalTreeUses.clear();
+    RemovedOperations.clear();
     NumOpsWantToKeepOrder.clear();
     NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
@@ -556,6 +591,11 @@
       BS->clear();
     }
     MinBWs.clear();
+    UseTreeIndices.clear();
+    Nodes.clear();
+    NodesList.clear();
+    EntriesCosts.clear();
+    CostsRecalculations = 0;
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -599,6 +639,9 @@
     return MinVecRegSize;
   }
 
+  /// Save seed instructions to try to vectorize them partially later.
+  void recordSeeds(ArrayRef<Value *> Ops);
+
   /// Check if ArrayType or StructType is isomorphic to some VectorType.
   ///
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
@@ -608,6 +651,12 @@
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable() const;
 
+  /// Cut the tree to make it partially vectorizable.
+  void cutTree(ArrayRef<unsigned> VecNodes);
+
+  /// Try to partially vectorize the tree via throttling.
+  bool tryPartialVectorization();
+
   OptimizationRemarkEmitter *getORE() { return ORE; }
 
   /// This structure holds any data we need about the edges being traversed
@@ -1315,6 +1364,9 @@
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, int> ScalarToTreeEntry;
 
+  /// Tree entries that should not be vectorized due to throttling.
+  SmallVector<unsigned, 4> RemovedOperations;
+
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
 
@@ -1334,6 +1386,25 @@
   };
   using UserList = SmallVector<ExternalUser, 16>;
 
+  struct Node {
+    Node(unsigned I) : Idx(I) {}
+
+    /// Index of this node in the graph.
+    unsigned Idx = 0;
+
+    /// A previous node along the way to this node.
+    unsigned Prev = 0;
+
+    /// Color of this node.
+    unsigned Color = 0;
+
+    /// True if this node is a leaf in the graph.
+    bool IsLeaf = false;
+  };
+
+  /// \returns the cost of extracting the vectorized elements.
+  int getExtractOperationCost(ExternalUser &EU);
+
   /// Checks if two instructions may access the same memory.
   ///
   /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
@@ -1385,6 +1456,26 @@
   /// after vectorization.
   UserList ExternalUses;
 
+  /// The array of non-gathering nodes of the graph.
+  SmallVector<std::unique_ptr<Node>, 8> NodesList;
+
+  /// The array of costs for each tree entry.
+  SmallVector<int, 8> EntriesCosts;
+
+  /// Map from a tree entry index to its non-gathering node.
+  DenseMap<unsigned, Node *> Nodes;
+
+  /// List of all seed instructions that were not vectorized; we will try
+  /// to vectorize them later with partial vectorization.
+  SmallVector<std::unique_ptr<SmallVector<Value *, 8>>, 8> Seeds;
+
+  /// Number of tree nodes for which we have already recalculated the cost
+  /// of the subgraph during throttling.
+  int CostsRecalculations = 0;
+
+  /// For each in-tree user, the in-tree values that it uses.
+  SmallDenseMap<Value *, UserList> InternalTreeUses;
+
   /// Values used only by @llvm.assume calls.
   SmallPtrSet<const Value *, 32> EphValues;
 
@@ -1394,6 +1485,9 @@
   /// A list of blocks that we are going to CSE.
   SetVector<BasicBlock *> CSEBlocks;
 
+  /// For each tree entry index, the indices of the entries that it uses.
+  SmallDenseMap<unsigned, SmallVector<unsigned, 4>> UseTreeIndices;
+
   /// Contains all scheduling relevant data for an instruction.
   /// A ScheduleData either represents a single instruction or a member of an
   /// instruction bundle (= a group of instructions which is combined into a
@@ -1743,6 +1837,9 @@
   /// Attaches the BlockScheduling structures to basic blocks.
   MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 
+  /// Remove operations from the list of operations proposed to be scheduled.
+  void removeFromScheduling(BlockScheduling *BS);
+
   /// Performs the "real" scheduling. Done before vectorization is actually
   /// performed in a basic block.
   void scheduleBlock(BlockScheduling *BS);
@@ -1948,6 +2045,7 @@
             LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                               << ".\n");
             assert(!UseEntry->NeedToGather && "Bad state");
+            InternalTreeUses[U].push_back(ExternalUser(Scalar, U, FoundLane));
             continue;
           }
         }
@@ -2566,6 +2664,91 @@
   }
 }
 
+void BoUpSLP::cutTree(ArrayRef<unsigned> VecNodes) {
+  SmallPtrSet<Value *, 4> Removed;
+  // Canceling unprofitable elements.
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (!is_contained(VecNodes, I)) {
+      Entry->NeedToGather = true;
+      for (Value *V : Entry->Scalars) {
+        LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V
+                          << " out of proposed to vectorize.\n");
+        ScalarToTreeEntry.erase(V);
+        Removed.insert(V);
+        RemovedOperations.push_back(I);
+        MustGather.insert(V);
+        ExternalUses.erase(
+            std::remove_if(ExternalUses.begin(), ExternalUses.end(),
+                           [&](ExternalUser &EU) { return EU.Scalar == V; }),
+            ExternalUses.end());
+      }
+    }
+  }
+  // Canceled operations may still be used by the remaining vectorized
+  // operations, so the ExternalUses list has to be populated with the
+  // canceled elements as well.
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (is_contained(VecNodes, I)) {
+      for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+        Value *Scalar = Entry->Scalars[Lane];
+        for (User *U : Scalar->users()) {
+          LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+          Instruction *UserInst = dyn_cast<Instruction>(U);
+          if (!UserInst)
+            continue;
+          if (!Removed.count(U))
+            continue;
+          // Ignore users in the user ignore list.
+          if (is_contained(UserIgnoreList, UserInst))
+            continue;
+          LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation: "
+                            << *U << " from lane " << Lane
+                            << " from " << *Scalar << ".\n");
+          ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
+        }
+      }
+    }
+  }
+}
+
+bool BoUpSLP::tryPartialVectorization() {
+  bool Changed = false;
+  for (unsigned I = 0; I < Seeds.size(); ++I) {
+    auto &S = *Seeds[I].get();
+    // Check that these seed instructions are still alive.
+    if (std::any_of(S.begin(), S.end(), [](Value *V) {
+          return (!(cast<Instruction>(V))->getOperand(0));
+        }))
+      continue;
+
+    buildTree(S);
+
+    // If other parts of the BB were already vectorized, the remaining
+    // tree might not be interesting enough to look at.
+    if (isTreeTinyAndNotFullyVectorizable())
+      continue;
+
+    int Cost = getTreeCost(true);
+    if (Cost < -SLPCostThreshold) {
+      vectorizeTree();
+      Changed = true;
+    }
+  }
+  Seeds.clear();
+  return Changed;
+}
+
+void BoUpSLP::recordSeeds(ArrayRef<Value *> Ops) {
+  Seeds.push_back(
+      llvm::make_unique<SmallVector<Value *, 8>>(Ops.begin(), Ops.end()));
+}
+
 unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
   unsigned N;
   Type *EltTy;
@@ -3073,7 +3256,7 @@
   return true;
 }
 
-int BoUpSLP::getSpillCost() const {
+int BoUpSLP::getSpillCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) const {
   // Walk from the bottom of the tree to the top, tracking which values are
   // live. When we see a call instruction that is not part of our tree,
   // query TTI to see if there is a cost to keeping values live over it
@@ -3097,7 +3280,7 @@
     // Update LiveValues.
     LiveValues.erase(PrevInst);
     for (auto &J : PrevInst->operands()) {
-      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+      if (isa<Instruction>(&*J) && ScalarsToVec.count(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
     }
 
@@ -3138,15 +3321,302 @@
   return Cost;
 }
 
-int BoUpSLP::getTreeCost() {
+int BoUpSLP::getExtractOperationCost(ExternalUser &EU) {
+  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+
+  // Uses by ephemeral values are free (because the ephemeral value will be
+  // removed prior to code generation, and so the extraction will be
+  // removed as well).
+  if (EphValues.count(EU.User))
+    return 0;
+
+  // If we plan to rewrite the tree in a smaller type, we will need to sign
+  // extend the extracted value back to the original type. Here, we account
+  // for the extract and the added cost of the sign extend if needed.
+  auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+
+  if (MinBWs.count(ScalarRoot)) {
+    auto *MinTy = IntegerType::get(F->getContext(),
+                                   MinBWs[ScalarRoot].first);
+    auto Extend =
+        MinBWs[ScalarRoot].second ?
+            Instruction::SExt : Instruction::ZExt;
+    VecTy = VectorType::get(MinTy, BundleWidth);
+    return (TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                          VecTy, EU.Lane));
+  }
+  return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+}
+
+int BoUpSLP::getExtractCost(const SmallPtrSetImpl<Value *> &ScalarsToVec) {
+  int ExtractCost = 0;
+  SmallPtrSet<Value *, 16> ExtractCostCalculated;
+  for (ExternalUser &EU : ExternalUses) {
+    // We only add extract cost once for the same scalar.
+    if (!ExtractCostCalculated.insert(EU.Scalar).second)
+      continue;
+
+    // Avoid non-vectorized scalars.
+    if (!ScalarsToVec.count(EU.Scalar)) {
+      // Consider the possibility of extracting vectorized
+      // values for use by the canceled elements.
+      if (InternalTreeUses.find(EU.Scalar) != InternalTreeUses.end())
+        for (ExternalUser &IU : InternalTreeUses[EU.Scalar])
+          ExtractCost += getExtractOperationCost(IU);
+      continue;
+    }
+
+    ExtractCost += getExtractOperationCost(EU);
+  }
+  return ExtractCost;
+}
+
+int BoUpSLP::getInsertCost(const SmallPtrSetImpl<Value *> &VecToScalars) {
+  int InsertCost = 0;
+  for (unsigned I = 0; I < VectorizableTree.size(); I++) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    for (Value *V : Entry->Scalars) {
+      auto *Inst = cast<Instruction>(V);
+      for (Use &U : Inst->operands()) {
+        Value *Op = U.get();
+        if (VecToScalars.count(Op))
+          InsertCost += getGatherCost(Op);
+      }
+    }
+  }
+  return InsertCost;
+}
+
+int BoUpSLP::getSubTreeCost(ArrayRef<unsigned> VecNodes) {
   int Cost = 0;
-  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
-                    << VectorizableTree.size() << ".\n");
+  int ExtractCost = 0;
+  int SpillCost = 0;
+  SmallPtrSet<Value *, 4> ScalarsToVec;
+  SmallPtrSet<Value *, 4> VecToScalars;
+
+#ifndef NDEBUG
+  for (unsigned N : VecNodes) {
+    if (!N)
+      continue;
+    do {
+      N = Nodes[N]->Prev;
+      assert(is_contained(VecNodes, N) &&
+             "An incorrect gap in indices according to the tree structure");
+    } while (N != 0);
+  }
+#endif
+
+  CostsRecalculations += VectorizableTree.size();
+
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    TreeEntry *Entry = &VectorizableTree[I];
+    if (Entry->NeedToGather)
+      continue;
+    if (is_contained(VecNodes, I)) {
+      for (Value *V : Entry->Scalars)
+        ScalarsToVec.insert(V);
+      int GatherCost = 0;
+      auto It = UseTreeIndices.find(I);
+      if (It != UseTreeIndices.end())
+        for (int Gather : It->second)
+          if (VectorizableTree[Gather].NeedToGather)
+            GatherCost += EntriesCosts[Gather];
+      Cost += EntriesCosts[I] + GatherCost;
+    } else {
+      for (Value *V : Entry->Scalars)
+        VecToScalars.insert(V);
+    }
+  }
+
+  int Inserts = getInsertCost(VecToScalars);
+  Cost += Inserts;
+  ExtractCost = getExtractCost(ScalarsToVec);
+  Cost += ExtractCost;
+  SpillCost = getSpillCost(ScalarsToVec);
+  Cost += SpillCost;
+
+  return Cost;
+}
+
+void BoUpSLP::BFS() {
+  std::vector<unsigned> Worklist;
+  Worklist.push_back(0);
+  Nodes[0]->Color = 1;
+  do {
+    unsigned Current = Worklist.back();
+    Worklist.pop_back();
+    bool isLeaf = true;
+    for (unsigned Next : UseTreeIndices[Current]) {
+      if (Next == Current)
+        continue;
+      if (VectorizableTree[Next].NeedToGather)
+        continue;
+      isLeaf = false;
+      // To simplify the algorithm, consider
+      // the top node of a cycle as a leaf.
+      if (Nodes[Next]->Color != 0) {
+        isLeaf = true;
+      } else {
+        Worklist.push_back(Next);
+        Nodes[Next]->Color = 1;
+        Nodes[Next]->Prev = Current;
+      }
+    }
+    if (isLeaf && Current)
+      Nodes[Current]->IsLeaf = true;
+  } while (!Worklist.empty());
+}
+
+unsigned BoUpSLP::cutBranch(unsigned LeafIdx, unsigned L,
+                            ArrayRef<unsigned> VecNodes, int &Cost) {
+  SmallVector<unsigned, 4> Vecs;
+  SmallVector<unsigned, 4> Path;
+
+  for (unsigned V : VecNodes)
+    Vecs.push_back(V);
+  assert(Nodes[LeafIdx]->IsLeaf && "This is supposed to be a leaf node");
+  assert(Nodes[L]->IsLeaf && "This is supposed to be a leaf node");
+
+  unsigned I = L;
+  unsigned RealOpAt = UINT_MAX;
+  do {
+    if (Nodes[I]->Color == LeafIdx)
+      break;
+    Path.push_back(I);
+    I = Nodes[I]->Prev;
+  } while (I != 0);
+  if (I == 0)
+    Path.push_back(0);
+  // Find the first entry with operations that are worth vectorizing.
+  for (I = Path.size(); I--;) {
+    unsigned Idx = Nodes[Path[I]]->Idx;
+    InstructionsState S = getSameOpcode(VectorizableTree[Idx].Scalars);
+    auto *Inst = S.MainOp;
+    if (Inst && (isa(Inst) || isa(Inst) ||
+                 isa(Inst))) {
+      RealOpAt = I;
+      break;
+    }
+  }
+  if (RealOpAt == UINT_MAX)
+    return UINT_MAX;
+  for (I = 0; I < Path.size(); I++) {
+    SmallVector<unsigned, 4> SubTree(Vecs);
+    SubTree.insert(SubTree.end(), Path.begin() + I, Path.end());
+    if (MaxCostsRecalculations < CostsRecalculations || SubTree.size() <= 1 ||
+        I > RealOpAt)
+      break;
+    Cost = getSubTreeCost(SubTree);
+    if (Cost < -SLPCostThreshold)
+      return Path[I];
+  }
+  return UINT_MAX;
+}
+
+int BoUpSLP::extendSubTree(unsigned LeafIdx, ArrayRef<unsigned> VecNodes) {
+  int SubTreeCost = 0;
+  SmallVector<unsigned, 4> Vecs;
+  SmallVector<unsigned, 4> Path;
+  for (unsigned V : VecNodes) {
+    Vecs.push_back(V);
+    Path.push_back(V);
+  }
+  assert(Nodes[LeafIdx]->IsLeaf && "This is supposed to be a leaf node");
+
+  // Add leaves from the graph along the way to the bottom of
+  // the graph if profitable.
+  for (auto &N : Nodes) {
+    unsigned L = N.first;
+    if (Nodes[L]->Color == LeafIdx || !Nodes[L]->IsLeaf)
+      continue;
+    assert(Nodes[L]->IsLeaf && "This is supposed to be a leaf node");
+    int Cost;
+    unsigned Node = cutBranch(LeafIdx, L, Vecs, Cost);
+    if (Node != UINT_MAX) {
+      SmallVector<unsigned, 4> BranchPath;
+      unsigned I = Node;
+      do {
+        if (Nodes[I]->Color == LeafIdx)
+          break;
+        BranchPath.push_back(I);
+        I = Nodes[I]->Prev;
+      } while (I != 0);
+      Vecs.insert(Vecs.end(), BranchPath.begin(), BranchPath.end());
+      for (unsigned V : BranchPath)
+        Nodes[V]->Color = LeafIdx;
+      SubTreeCost = Cost;
+    }
+  }
 
-  unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+  if (SubTreeCost < -SLPCostThreshold) {
+    assert(Vecs.size() < NodesList.size() &&
+           "Incorrect number of nodes to vectorize");
+    LLVM_DEBUG(dbgs() << "SLP: Reduced the number of elements to "
+                      << Vecs.size() << " in the tree with " << SubTreeCost
+                      << " cost to make it partially vectorizable.\n");
+    cutTree(Vecs);
+    return SubTreeCost;
+  }
+  return INT_MAX;
+}
+int BoUpSLP::findSubTree() {
+  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+    if (VectorizableTree[I].NeedToGather)
+      continue;
+    NodesList.push_back(llvm::make_unique<Node>(I));
+    Nodes[I] = NodesList.back().get();
+    assert(I == Nodes[I]->Idx && "Incorrect node index");
+  }
+  BFS();
+
+  // Walk the graph leaves and reduce the branch height
+  // to find any profitable part.
+ for (auto &N : Nodes) { + if (!N.second->IsLeaf) + continue; + unsigned L = N.second->Idx; + SmallVector Path; + int Cost; + unsigned I = cutBranch(L, L, Path, Cost); + if (I != UINT_MAX) { + do { + Path.push_back(I); + Nodes[I]->Color = L; + I = Nodes[I]->Prev; + } while (I != 0); + if (Path.back() != 0) { + Nodes[0]->Color = L; + Path.push_back(0); + } + // Try to extend the already profitable part to maximize + // the number of operations to vectorize. + int CostWithLeafs = extendSubTree(L, Path); + if (CostWithLeafs == INT_MAX) { + LLVM_DEBUG(dbgs() << "SLP: Reduced the number of elements to " + << Path.size() << " in the tree with " << Cost + << " cost to make it partially vectorizable.\n"); + cutTree(Path); + return Cost; + } + return CostWithLeafs; + } + } + return INT_MAX; +} + +int BoUpSLP::getTreeCost(bool cutTree) { + SmallPtrSet ScalarsToVec; + SmallPtrSet VecToScalars; + int CostSum = 0; + LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " + << VectorizableTree.size() << ".\n"); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = VectorizableTree[I]; + EntriesCosts.push_back(0); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -3167,56 +3637,47 @@ })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + for (auto &UseTreeIdx : TE.UserTreeIndices) + UseTreeIndices[UseTreeIdx.Idx].push_back(I); + + if (!TE.NeedToGather) { + for (Value *V : TE.Scalars) + ScalarsToVec.insert(V); + } + + EntriesCosts[I] = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << EntriesCosts[I] << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; + + CostSum += EntriesCosts[I]; } - SmallPtrSet ExtractCostCalculated; int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; - - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; + int SpillCost = 0; + int Cost = CostSum; - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? 
Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } - } - - int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + ExtractCost = getExtractCost(ScalarsToVec); + Cost += ExtractCost; + SpillCost = getSpillCost(ScalarsToVec); + Cost += SpillCost; std::string Str; - { - raw_string_ostream OS(Str); - OS << "SLP: Spill Cost = " << SpillCost << ".\n" - << "SLP: Extract Cost = " << ExtractCost << ".\n" - << "SLP: Total Cost = " << Cost << ".\n"; + if (SLPThrottling && cutTree && Cost >= -SLPCostThreshold) { + Cost = findSubTree(); + { + raw_string_ostream OS(Str); + OS << "SLP: Partial vectorization with Total Cost = " << Cost << ".\n"; + } + } else { + { + raw_string_ostream OS(Str); + OS << "SLP: Spill Cost = " << SpillCost << ".\n" + << "SLP: Extract Cost = " << ExtractCost << ".\n" + << "SLP: Total Cost = " << Cost << ".\n"; + } } LLVM_DEBUG(dbgs() << Str); - if (ViewSLPTree) ViewGraph(this, "SLP" + F->getName(), false, Str); @@ -3935,7 +4396,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (RemovedOperations.size()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -4064,13 +4530,16 @@ Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG - for (User *U : Scalar->users()) { - LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - - // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); - } + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (!RemovedOperations.size()) + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + + // It is legal to replace users in the ignorelist by undef. 
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + "Replacing out-of-tree value with undef"); + } #endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); @@ -4547,6 +5016,32 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (int I : RemovedOperations) { + TreeEntry *Entry = &VectorizableTree[I]; + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5015,6 +5510,12 @@ << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } + + // Partially vectorize trees after all full vectorization is done, + // otherwise, we could prevent more profitable full vectorization with + // smaller vector sizes. + if (SLPThrottling) + Changed |= R.tryPartialVectorization(); } if (Changed) { @@ -5090,6 +5591,8 @@ // Move to the next bundle. i += VF - 1; Changed = true; + } else { + R.recordSeeds(Operands); } } @@ -5319,6 +5822,8 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else { + R.recordSeeds(Ops); } } } Index: test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -12,21 +12,18 @@ ; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> ; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32> ; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]] -; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0 -; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64 -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]] +; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]] -; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP2]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]] -; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] -; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 -; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* 
[[P]], i64 [[S3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP4]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]] ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/AArch64/horizontal.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -28,39 +28,42 @@ ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_LR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_026]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[TMP9]] ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD]], undef ; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[ADD11]], undef -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP7]]) -; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]]) ; CHECK-NEXT: 
[[ADD27:%.*]] = add nsw i32 [[ADD19]], undef ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_025]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> undef, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 1, i32 1 +; CHECK-NEXT: [[TMP13]] = add nsw <2 x i32> [[TMP12]], [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP14]], [[H]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: -; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP15]], [[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret i32 [[S_0_LCSSA]] ; entry: Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -39,7 +39,6 @@ ; CHECK-LABEL: @store_chain_v2i64( ; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1 -; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1 ; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8 ; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8 ; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8 @@ -48,10 +47,13 @@ ; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]] ; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]] ; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]] -; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]] -; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8 -; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP1_0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP1_1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; %a.0 = getelementptr i64, i64* %a, i64 0 Index: test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -12,38 +12,37 @@ ; SSE: for.body: ; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S1_055:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I40:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S0_054:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], 
[ [[COND_I44:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP21:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; SSE-NEXT: [[ADD:%.*]] = fadd float [[S0_054]], [[TMP0]] -; SSE-NEXT: [[ADD3:%.*]] = fadd float [[S1_055]], [[TMP0]] -; SSE-NEXT: [[MUL:%.*]] = fmul float [[S0_054]], 0.000000e+00 -; SSE-NEXT: [[ADD4:%.*]] = fadd float [[MUL]], [[ADD3]] -; SSE-NEXT: [[MUL5:%.*]] = fmul float [[S1_055]], 0.000000e+00 -; SSE-NEXT: [[ADD6:%.*]] = fadd float [[MUL5]], [[ADD]] -; SSE-NEXT: [[CMP_I:%.*]] = fcmp olt float [[ADD6]], 1.000000e+00 -; SSE-NEXT: [[COND_I:%.*]] = select i1 [[CMP_I]], float [[ADD6]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I51:%.*]] = fcmp olt float [[COND_I]], -1.000000e+00 -; SSE-NEXT: [[CMP_I49:%.*]] = fcmp olt float [[ADD4]], 1.000000e+00 -; SSE-NEXT: [[COND_I50:%.*]] = select i1 [[CMP_I49]], float [[ADD4]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I47:%.*]] = fcmp olt float [[COND_I50]], -1.000000e+00 -; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[COND_I]], 0.000000e+00 -; SSE-NEXT: [[MUL10:%.*]] = select i1 [[CMP_I51]], float -0.000000e+00, float [[COND_I_OP]] -; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[COND_I50]], 0.000000e+00 -; SSE-NEXT: [[MUL11:%.*]] = select i1 [[CMP_I47]], float -0.000000e+00, float [[COND_I50_OP]] -; SSE-NEXT: [[ADD13]] = fadd float [[MUL10]], [[MUL11]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] +; SSE-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], +; SSE-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> +; SSE-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], +; SSE-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer +; SSE-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] +; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; SSE-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] ; SSE-NEXT: [[CMP_I45:%.*]] = fcmp olt float [[ADD13]], 1.000000e+00 ; SSE-NEXT: [[COND_I46:%.*]] = select i1 [[CMP_I45]], float [[ADD13]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I43:%.*]] = fcmp olt float [[COND_I46]], -1.000000e+00 -; SSE-NEXT: [[COND_I44]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] -; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[MUL11]], 1.000000e+00 -; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[MUL11]], 
float 1.000000e+00 -; SSE-NEXT: [[CMP_I39:%.*]] = fcmp olt float [[COND_I42]], -1.000000e+00 -; SSE-NEXT: [[COND_I40]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] +; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[TMP17]], 1.000000e+00 +; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[TMP17]], float 1.000000e+00 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[COND_I42]], i32 0 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[COND_I46]], i32 1 +; SSE-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], +; SSE-NEXT: [[TMP21]] = select <2 x i1> [[TMP20]], <2 x float> , <2 x float> [[TMP19]] ; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; SSE: for.end: Index: test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -15,18 +15,20 @@ ; CHECK: for.body6: ; CHECK-NEXT: br label [[FOR_BODY12:%.*]] ; CHECK: for.body12: -; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ] -; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]] -; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]] -; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[FOR_BODY6]] ], [ [[TMP7:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double [[ADD15]], 4.000000e+00 ; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]] -; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] -; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[TMP4]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL18]], i32 1 +; CHECK-NEXT: [[TMP7]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] ; CHECK: for.inc21: ; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] Index: test/Transforms/SLPVectorizer/X86/lookahead.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/lookahead.ll +++ test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -30,14 +30,18 @@ ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 -; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]] ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] ; 
CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] -; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]] -; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]] -; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]] -; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 -; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[A_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C_1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[D_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUBCD_0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[SUBAB_1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry:
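For reference, a minimal sketch of how the new knobs could be exercised with opt on a small input. The flag names come from the cl::opt definitions added above, but the exact RUN line of the new test is not shown in this diff, so the invocation below is an assumption; whether such a tree ends up fully or only partially vectorized depends on the target cost model.

; RUN: opt < %s -slp-vectorizer -slp-throttling=true -slp-throttling-budget=500 -S -mtriple=x86_64-unknown-linux-gnu
define void @seeds(double* %a, double %x, double %y) {
entry:
  %p1 = getelementptr inbounds double, double* %a, i64 1
  %v0 = load double, double* %a, align 8
  %v1 = load double, double* %p1, align 8
  ; Only part of this chain may be profitable to vectorize; with throttling
  ; enabled the SLP vectorizer may cut the tree instead of giving up entirely.
  %r0 = fsub double %v0, %x
  %r1 = fsub double %v1, %y
  store double %r0, double* %a, align 8
  store double %r1, double* %p1, align 8
  ret void
}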