Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" @@ -115,8 +116,17 @@ "number ")); static cl::opt -ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, - cl::desc("Attempt to vectorize horizontal reductions")); + ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, + cl::desc("Attempt to vectorize horizontal reductions")); + +static cl::opt + SLPThrottling("slp-throttling", cl::init(true), cl::Hidden, + cl::desc("Enable tree partial vectorize with throttling")); + +static cl::opt + MaxCostsRecalculations("slp-throttling-budget", cl::init(128), cl::Hidden, + cl::desc("Limit the total number of nodes for cost " + "recalculations during throttling")); static cl::opt ShouldStartVectorizeHorAtStore( "slp-vectorize-hor-store", cl::init(false), cl::Hidden, @@ -533,9 +543,29 @@ /// holding live values over call sites. int getSpillCost() const; + /// \returns the cost extracting vectorized elements. + int getExtractCost() const; + + /// \returns the cost of gathering canceled elements to be used + /// by vectorized operations during throttling. + int getInsertCost() const; + + /// Explore SLP graph non-gathering nodes structure. + void treeTraversal(); + + /// Cut given path until it might be good to vectorize. + Optional cutPath(int &Cost, ArrayRef Path); + + /// Find a non-gathering leaf node from current node C and record the path + /// on the way. + TreeEntry *findLeaf(TreeEntry *C, SmallVectorImpl &Path) const; + + /// Find a subtree of the whole tree suitable to be vectorized. 
+ Optional findSubTree(int Cost); + /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - int getTreeCost(); + Optional getTreeCost(bool CutTree = false); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -556,6 +586,8 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + InternalTreeUses.clear(); + RemovedOperations.clear(); NumOpsWantToKeepOrder.clear(); NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { @@ -563,6 +595,9 @@ BS->clear(); } MinBWs.clear(); + ScalarsToVec.clear(); + VecToScalars.clear(); + CostsRecalculations = 0; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -606,6 +641,9 @@ return MinVecRegSize; } + /// Save seed instructions to try partially vectorize later. + void recordSeeds(ArrayRef Ops); + /// Check if ArrayType or StructType is isomorphic to some VectorType. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. @@ -615,6 +653,40 @@ /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Try to cut the tree to make it partially vectorizable. + bool cutTree(); + + /// Try partially vectorize the tree via throttling. When vectorizing the + /// whole tree is not profitable, we can consider vectorizing part of that + /// tree. SLP algorithm looks to operations to vectorize starting from seed + /// instructions on the bottom toward the end of chains of dependencies to the + /// top of SLP graph, it groups potentially vectorizable operations in + /// scalar form to bundles. + /// For example: + /// + /// scalar form + /// | + /// scalar form scalar form + /// \ / + /// scalar form + /// + /// Total cost is not profitable to vectorize, hence all operations are in + /// scalar form. 
+ /// + /// Here is the same tree after SLP throttling transformation: + /// + /// vector form + /// | + /// vector form scalar form + /// \ / + /// vector form + /// + /// So, we can throttle some operations in such a way that it is still + /// profitable to vectorize part of the tree, while all tree vectorization + /// does not make sense. + /// More details: http://www.llvm.org/devmtg/2015-10/slides/Porpodas-ThrottlingAutomaticVectorization.pdf + bool tryPartialVectorization(); + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1199,6 +1271,16 @@ /// Does this entry require reordering? ArrayRef ReorderIndices; + /// Cost of this tree entry. + int Cost = 0; + + /// True if this node has more than one non-gathering child. + bool IsBranch = false; + + /// True if this node is a gathering node or proposed to be a gathering + /// node during throttling. + bool ProposedToGather = false; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -1211,6 +1293,9 @@ /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + /// The index containing the use of this entry. + SmallVector UseTreeIndices; + /// The index of this treeEntry in VectorizableTree.
int Idx = -1; @@ -1346,12 +1431,12 @@ if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned ReuseIdx : ReuseShuffleIndices) - dbgs() << ReuseIdx << ", "; + for (unsigned Idx : ReuseShuffleIndices) + dbgs() << Idx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned ReorderIdx : ReorderIndices) - dbgs() << ReorderIdx << ", "; + for (unsigned Idx : ReorderIndices) + dbgs() << Idx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1396,8 +1481,10 @@ MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx.UserTE) + if (UserTreeIdx.UserTE) { Last->UserTreeIndices.push_back(UserTreeIdx); + VectorizableTree[UserTreeIdx.UserTE->Idx]->UseTreeIndices.push_back(Last); + } return Last; } @@ -1433,6 +1520,16 @@ /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; + /// Tree entries that should not be vectorized due to throttling. + SmallVector RemovedOperations; + + /// Tree values proposed to be vectorized. + ValueSet ScalarsToVec; + + /// Tree values once considered to be vectorized, but later with throttling + /// decided to stay in a scalar form. + ValueSet VecToScalars; + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1452,6 +1549,9 @@ }; using UserList = SmallVector; + /// \returns the cost of extracting the vectorized elements. + int getExtractOperationCost(const ExternalUser &EU) const; + /// Checks if two instructions may access the same memory. /// /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it @@ -1503,6 +1603,17 @@ /// after vectorization. UserList ExternalUses; + /// List of all seed instructions, we could try to vectorize those seed + /// instructions with partial vectorization. + SmallVector, 2> Seeds; + + /// Number of times in nodes that we already recalculated cost of + /// the subtree during throttling.
+ int CostsRecalculations = 0; + + /// Internal tree operations and their uses of values proposed to be vectorized. + SmallDenseMap InternalTreeUses; + /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; @@ -1889,6 +2000,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove operations from the list of proposed to schedule. + void removeFromScheduling(BlockScheduling *BS); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); @@ -2110,6 +2224,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); + InternalTreeUses[U].emplace_back(Scalar, U, FoundLane); continue; } } @@ -2762,6 +2877,105 @@ } } +bool BoUpSLP::cutTree() { + SmallPtrSet Removed; + SmallVector VecNodes; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (!Entry->ProposedToGather) + VecNodes.push_back(Entry); + } + if (VecNodes.size() <= 2) + return false; + auto It = llvm::find_if(VecNodes, [](TreeEntry *E) { + Instruction *Inst = E->getMainOp(); + return (Inst && (isa(Inst) || isa(Inst) || + isa(Inst))); + }); + if (It == VecNodes.end()) + return false; + // Canceling unprofitable elements.
+ for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->NeedToGather) + continue; + if (Entry->ProposedToGather) { + Entry->NeedToGather = true; + for (Value *V : Entry->Scalars) { + LLVM_DEBUG(dbgs() << "SLP: Remove scalar " << *V + << " out of proposed to vectorize.\n"); + ScalarToTreeEntry.erase(V); + Removed.insert(V); + RemovedOperations.push_back(Entry); + MustGather.insert(V); + ExternalUses.erase( + std::remove_if(ExternalUses.begin(), ExternalUses.end(), + [&V](ExternalUser &EU) { return EU.Scalar == V; }), + ExternalUses.end()); + } + } + } + // For all canceled operations we should consider the possibility of + // use by with non-canceled operations and for that, it requires + // to populate ExternalUser list with canceled elements. + for (TreeEntry *Entry : VecNodes) + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); + auto *UserInst = dyn_cast(U); + if (!UserInst) + continue; + if (!Removed.count(U)) + continue; + // Ignore users in the user ignore list. + if (is_contained(UserIgnoreList, UserInst)) + continue; + LLVM_DEBUG(dbgs() << "SLP: Need to extract canceled operation :" << *U + << " from lane " << Lane << " from " << *Scalar + << ".\n"); + ExternalUses.emplace_back(ExternalUser(Scalar, U, Lane)); + } + } + return true; +} + +bool BoUpSLP::tryPartialVectorization() { + bool Changed = false; + for (ArrayRef S : Seeds) { + // Check those seed instructions are still alive. + if (llvm::any_of(S, [](Value *V) { + return (!(cast(V))->getOperand(0)); + })) + continue; + + // Stop if we are over our budget of maximum cost calculations. + if (CostsRecalculations >= MaxCostsRecalculations) + break; + + buildTree(S); + + // If other part BB were vectorized the tree might not be + // enough interest to look. 
+ if (isTreeTinyAndNotFullyVectorizable()) + continue; + + Optional Cost = getTreeCost(true); + if (Cost.hasValue() && Cost.getValue() < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to partially vectorize with cost=" + << Cost.getValue() << "\n"); + vectorizeTree(); + Changed = true; + } + } + Seeds.clear(); + return Changed; +} + +void BoUpSLP::recordSeeds(ArrayRef Ops) { + Seeds.push_back(SmallVector(Ops.begin(), Ops.end())); +} + unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { unsigned N; Type *EltTy; @@ -3294,7 +3508,7 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) + if (isa(&*J) && ScalarsToVec.count(&*J)) LiveValues.insert(cast(&*J)); } @@ -3339,15 +3553,208 @@ return Cost; } -int BoUpSLP::getTreeCost() { - int Cost = 0; +int BoUpSLP::getExtractOperationCost(const ExternalUser &EU) const { + unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); + + // Uses by ephemeral values are free (because the ephemeral value will be + // removed prior to code generation, and so the extraction will be + // removed as well). + if (EphValues.count(EU.User)) + return 0; + + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + Value *ScalarRoot = VectorizableTree[0]->Scalars[0]; + + auto It = MinBWs.find(ScalarRoot); + if (It != MinBWs.end()) { + uint64_t Width = It->second.first; + bool Signed = It->second.second; + auto *MinTy = IntegerType::get(F->getContext(), Width); + unsigned ExtOp = Signed ? 
Instruction::SExt : Instruction::ZExt; + VecTy = VectorType::get(MinTy, BundleWidth); + return (TTI->getExtractWithExtendCost(ExtOp, EU.Scalar->getType(), VecTy, + EU.Lane)); + } + return TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); +} + +int BoUpSLP::getExtractCost() const { + int ExtractCost = 0; + SmallPtrSet ExtractCostCalculated; + for (const ExternalUser &EU : ExternalUses) { + // We only add extract cost once for the same scalar. + if (!ExtractCostCalculated.insert(EU.Scalar).second) + continue; + + // Avoid non-vectorized scalars. + if (!ScalarsToVec.count(EU.Scalar)) { + // Consider the possibility of extracting vectorized + // values for canceled elements use. + auto It = InternalTreeUses.find(EU.Scalar); + if (It != InternalTreeUses.end()) { + const UserList &UL = It->second; + for (const ExternalUser &IU : UL) + ExtractCost += getExtractOperationCost(IU); + } + continue; + } + ExtractCost += getExtractOperationCost(EU); + } + return ExtractCost; +} + +int BoUpSLP::getInsertCost() const { + int InsertCost = 0; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (Entry->NeedToGather) + continue; + for (Value *V : Entry->Scalars) { + auto *Inst = cast(V); + for (Use &U : Inst->operands()) { + Value *Op = U.get(); + if (VecToScalars.count(Op)) + InsertCost += getGatherCost(Op); + } + } + } + return InsertCost; +} + +Optional BoUpSLP::cutPath(int &Cost, ArrayRef Path) { + // Decrement nodes one by one until Path is empty or we find a suitable set + // of nodes for partial tree vectorization + for (TreeEntry *N : Path) { + CostsRecalculations++; + + // We no longer propose to vectorize this node and we subtract
+ assert(!N->ProposedToGather && "Incorrect node state, visiting twice."); + N->ProposedToGather = true; + Cost -= N->Cost; + for (Value *V : N->Scalars) { + ScalarsToVec.erase(V); + VecToScalars.insert(V); + } + int PartialCost = Cost; + PartialCost += getExtractCost() + getSpillCost() + getInsertCost(); + if (PartialCost < -SLPCostThreshold && cutTree()) + return PartialCost; + } + return None; +} + +BoUpSLP::TreeEntry * +BoUpSLP::findLeaf(TreeEntry *C, SmallVectorImpl &Path) const { + int NonGatherUse; + if (!is_contained(Path, C)) + Path.push_back(C); + do { + NonGatherUse = 0; + for (TreeEntry *Next : llvm::reverse(C->UseTreeIndices)) { + // Ignore any processed nodes to avoid cycles. + if (Next->ProposedToGather || is_contained(Path, Next) || Next == C) + continue; + C = Next; + Path.push_back(C); + NonGatherUse++; + break; + } + } while (NonGatherUse != 0); + return C; +} + +void BoUpSLP::treeTraversal() { + // Find nodes with more than one use and it might include cycles because + // we don't know our routes yet. + for (int N = 0, E = VectorizableTree.size(); N < E; ++N) { + TreeEntry *TE = VectorizableTree[N].get(); + if (TE->NeedToGather) { + TE->ProposedToGather = true; + continue; + } + TE->IsBranch = llvm::count_if(TE->UseTreeIndices, [&N](TreeEntry *Next) { + return (Next->Idx != N && !Next->NeedToGather); + }) > 1; + } +} + +Optional BoUpSLP::findSubTree(int Cost) { + SmallVector Path; + SmallVector SubPath; + TreeEntry *Node = nullptr; + treeTraversal(); + + // To start we can find just one leaf node that happens to be not the root + // node of the graph i.e. with non-zero index. Then, Path is route from the + // root node to our leaf node. + if (!findLeaf(VectorizableTree[0].get(), Path)->Idx) + return None; + do { + Node = Path.back(); + assert(!Node->ProposedToGather && "Incorrect node state"); + // If we found a branch node i.e. 
node with more than one non-gathering + // child, we could try to find set of profitable nodes in SubPath to + // vectorize and if there is no such set of profitable nodes then we could + // consider another leaf that is reachable from this branch node. + if (Node->IsBranch) { + Optional PartialCost = cutPath(Cost, SubPath); + if (CostsRecalculations >= MaxCostsRecalculations) { + SubPath.clear(); + break; + } + if (PartialCost.hasValue()) + return PartialCost; + TreeEntry *NextFromBranch = nullptr; + auto It = llvm::find_if(llvm::reverse(Node->UseTreeIndices), + [&Node, &Path](TreeEntry *E) { + return (E != Node && !E->ProposedToGather && + !is_contained(Path, E)); + }); + if (It != Node->UseTreeIndices.rend()) + NextFromBranch = *It; + SubPath.clear(); + if (NextFromBranch && NextFromBranch != Node) { + Node = findLeaf(NextFromBranch, Path); + } else { + Node->IsBranch = false; + } + } else { + // If this node is not a branch node then we could move to another node + // below until we reach the root node of the graph or encounter another + // branch node. + SubPath.push_back(Node); + Path.pop_back(); + } + } while (Node->Idx); + + // We don't have any branches now and reduce single remaining path now. + if (!SubPath.empty()) { + Optional PartialCost = cutPath(Cost, SubPath); + if (PartialCost.hasValue()) + return PartialCost; + } + +#ifndef NDEBUG + // Make sure that we have processed all nodes. 
+ if (CostsRecalculations < MaxCostsRecalculations) { + for (unsigned I = 1, E = VectorizableTree.size(); I < E; ++I) + assert(VectorizableTree[I]->ProposedToGather && "Incorrect node state"); + } +#endif + return None; +} + +Optional BoUpSLP::getTreeCost(bool CutTree) { + int CostSum = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); - - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I].get(); + for (auto &TEPtr : VectorizableTree) { + TreeEntry &TE = *TEPtr.get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -3362,65 +3769,66 @@ // existing heuristics based on tree size may yield different results. // if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + std::any_of(std::next(VectorizableTree.begin(), TE.Idx + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr &EntryPtr) { + return EntryPtr->NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; - int C = getEntryCost(&TE); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + if (!TE.NeedToGather) { + for (Value *V : TE.Scalars) + ScalarsToVec.insert(V); + } + + TE.Cost = getEntryCost(&TE); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << TE.Cost << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); - Cost += C; - } - - SmallPtrSet ExtractCostCalculated; - int ExtractCost = 0; - for (ExternalUser &EU : ExternalUses) { - // We only add extract cost once for the same scalar. 
- if (!ExtractCostCalculated.insert(EU.Scalar).second) - continue; - - // Uses by ephemeral values are free (because the ephemeral value will be - // removed prior to code generation, and so the extraction will be - // removed as well). - if (EphValues.count(EU.User)) - continue; - - // If we plan to rewrite the tree in a smaller type, we will need to sign - // extend the extracted value back to the original type. Here, we account - // for the extract and the added cost of the sign extend if needed. - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; - if (MinBWs.count(ScalarRoot)) { - auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); - auto Extend = - MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; - VecTy = VectorType::get(MinTy, BundleWidth); - ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), - VecTy, EU.Lane); - } else { - ExtractCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); - } + CostSum += TE.Cost; } + int ExtractCost = getExtractCost(); int SpillCost = getSpillCost(); - Cost += SpillCost + ExtractCost; + int Cost = CostSum + ExtractCost + SpillCost; - std::string Str; - { - raw_string_ostream OS(Str); + if (CutTree) { + // Avoid changing the decision of vectorizing the whole tree while doing + // partial vectorization. We have not seen any good examples of such + // decisions. 
+ if (Cost < -SLPCostThreshold) + return None; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *TE = TEPtr.get(); + if (TE->NeedToGather) + continue; + int GatherCost = 0; + for (TreeEntry *Gather : TE->UseTreeIndices) + if (Gather->NeedToGather) + GatherCost += Gather->Cost; + TE->Cost = TE->Cost + GatherCost; + } + Optional PartialCost = findSubTree(CostSum); + if (PartialCost.hasValue()) { + SmallString<256> Str; + raw_svector_ostream OS(Str); + Cost = PartialCost.getValue(); + OS << "SLP: Partial vectorization with Total Cost = " << Cost << ".\n"; + LLVM_DEBUG(dbgs() << Str); + if (ViewSLPTree) + ViewGraph(this, "SLP" + F->getName(), false, Str); + } + } else { + SmallString<256> Str; + raw_svector_ostream OS(Str); OS << "SLP: Spill Cost = " << SpillCost << ".\n" << "SLP: Extract Cost = " << ExtractCost << ".\n" << "SLP: Total Cost = " << Cost << ".\n"; + LLVM_DEBUG(dbgs() << Str); + if (ViewSLPTree) + ViewGraph(this, "SLP" + F->getName(), false, Str); } - LLVM_DEBUG(dbgs() << Str); - - if (ViewSLPTree) - ViewGraph(this, "SLP" + F->getName(), false, Str); return Cost; } @@ -4160,7 +4568,12 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + // Remove all Schedule Data from all nodes that we have changed + // vectorization decision. + if (!RemovedOperations.empty()) + removeFromScheduling(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -4289,13 +4702,16 @@ Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG - for (User *U : Scalar->users()) { - LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - - // It is legal to replace users in the ignorelist by undef. 
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); - } + // The tree might not be fully vectorized, so we don't have to + // check every user. + if (RemovedOperations.empty()) + for (User *U : Scalar->users()) { + LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); + + // It is legal to replace users in the ignorelist by undef. + assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + "Replacing out-of-tree value with undef"); + } #endif Value *Undef = UndefValue::get(Ty); Scalar->replaceAllUsesWith(Undef); @@ -4773,6 +5189,31 @@ ReadyInsts.clear(); } +void BoUpSLP::removeFromScheduling(BlockScheduling *BS) { + bool Removed = false; + for (TreeEntry *Entry : RemovedOperations) { + ScheduleData *SD = BS->getScheduleData(Entry->Scalars[0]); + if (SD && SD->isPartOfBundle()) { + if (!Removed) { + Removed = true; + BS->resetSchedule(); + } + BS->cancelScheduling(Entry->Scalars, SD->OpValue); + } + } + if (!Removed) + return; + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (Instruction *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, + [&](ScheduleData *SD) { SD->clearDependencies(); }); + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -5241,6 +5682,12 @@ << " underlying objects.\n"); Changed |= vectorizeGEPIndices(BB, R); } + + // Partially vectorize trees after all full vectorization is done, + // otherwise, we could prevent more profitable full vectorization with + // smaller vector sizes. 
+ if (SLPThrottling) + Changed |= R.tryPartialVectorization(); } if (Changed) { @@ -5296,19 +5743,21 @@ R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + Optional Cost = R.getTreeCost(); + assert(Cost.hasValue() && "Incorrect cost"); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost.getValue() << " for VF=" + << VF << "\n"); + if (Cost.getValue() < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost.getValue() + << "\n"); using namespace ore; R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", cast(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " + << "Stores SLP vectorized with cost " + << NV("Cost", Cost.getValue()) << " and with tree size " << NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); @@ -5316,6 +5765,8 @@ // Move to the next bundle. i += VF - 1; Changed = true; + } else { + R.recordSeeds(Operands); } } @@ -5527,7 +5978,9 @@ continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost() - UserCost; + Optional TreeCost = R.getTreeCost(); + assert(TreeCost.hasValue() && "Incorrect cost"); + int Cost = TreeCost.getValue() - UserCost; CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -5544,6 +5997,8 @@ I += VF - 1; NextInst = I + 1; Changed = true; + } else { + R.recordSeeds(Ops); } } } @@ -6357,9 +6812,10 @@ V.computeMinimumValueSizes(); // Estimate cost. 
- int TreeCost = V.getTreeCost(); + Optional TreeCost = V.getTreeCost(); + assert(TreeCost.hasValue() && "Incorrect cost"); int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth); - int Cost = TreeCost + ReductionCost; + int Cost = TreeCost.getValue() + ReductionCost; if (Cost >= -SLPCostThreshold) { V.getORE()->emit([&]() { return OptimizationRemarkMissed( @@ -6579,9 +7035,10 @@ /// \return true if it matches. static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl &BuildVectorOpds) { + Value *V; do { BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - Value *V = IV->getAggregateOperand(); + V = IV->getAggregateOperand(); if (isa(V)) break; IV = dyn_cast(V); Index: test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -12,39 +12,44 @@ ; SSE: for.body: ; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S1_055:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I40:%.*]], [[FOR_BODY]] ] -; SSE-NEXT: [[S0_054:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I44:%.*]], [[FOR_BODY]] ] +; SSE-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] -; SSE-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; SSE-NEXT: [[ADD:%.*]] = 
fadd float [[S0_054]], [[TMP0]] -; SSE-NEXT: [[ADD3:%.*]] = fadd float [[S1_055]], [[TMP0]] -; SSE-NEXT: [[MUL:%.*]] = fmul float [[S0_054]], 0.000000e+00 -; SSE-NEXT: [[ADD4:%.*]] = fadd float [[MUL]], [[ADD3]] -; SSE-NEXT: [[MUL5:%.*]] = fmul float [[S1_055]], 0.000000e+00 -; SSE-NEXT: [[ADD6:%.*]] = fadd float [[MUL5]], [[ADD]] -; SSE-NEXT: [[CMP_I:%.*]] = fcmp olt float [[ADD6]], 1.000000e+00 -; SSE-NEXT: [[COND_I:%.*]] = select i1 [[CMP_I]], float [[ADD6]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I51:%.*]] = fcmp olt float [[COND_I]], -1.000000e+00 -; SSE-NEXT: [[CMP_I49:%.*]] = fcmp olt float [[ADD4]], 1.000000e+00 -; SSE-NEXT: [[COND_I50:%.*]] = select i1 [[CMP_I49]], float [[ADD4]], float 1.000000e+00 -; SSE-NEXT: [[CMP_I47:%.*]] = fcmp olt float [[COND_I50]], -1.000000e+00 -; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[COND_I]], 0.000000e+00 -; SSE-NEXT: [[MUL10:%.*]] = select i1 [[CMP_I51]], float -0.000000e+00, float [[COND_I_OP]] -; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[COND_I50]], 0.000000e+00 -; SSE-NEXT: [[MUL11:%.*]] = select i1 [[CMP_I47]], float -0.000000e+00, float [[COND_I50_OP]] -; SSE-NEXT: [[ADD13]] = fadd float [[MUL10]], [[MUL11]] +; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 +; SSE-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP1]] +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 +; SSE-NEXT: [[ADD3:%.*]] = fadd float [[TMP3]], [[TMP1]] +; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x float> undef, float [[ADD]], i32 0 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[ADD3]], i32 1 +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP4]], [[TMP6]] +; SSE-NEXT: [[TMP8:%.*]] = fcmp olt <2 x float> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x float> [[TMP7]], <2 x float> +; SSE-NEXT: [[TMP10:%.*]] = fcmp olt <2 x float> [[TMP9]], +; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x 
float> [[TMP9]], i32 0 +; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[TMP11]], 0.000000e+00 +; SSE-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[TMP12]], 0.000000e+00 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> undef, float [[COND_I_OP]], i32 0 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[COND_I50_OP]], i32 1 +; SSE-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP10]], <2 x float> , <2 x float> [[TMP14]] +; SSE-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; SSE-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; SSE-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] ; SSE-NEXT: [[CMP_I45:%.*]] = fcmp olt float [[ADD13]], 1.000000e+00 ; SSE-NEXT: [[COND_I46:%.*]] = select i1 [[CMP_I45]], float [[ADD13]], float 1.000000e+00 ; SSE-NEXT: [[CMP_I43:%.*]] = fcmp olt float [[COND_I46]], -1.000000e+00 -; SSE-NEXT: [[COND_I44]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] -; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[MUL11]], 1.000000e+00 -; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[MUL11]], float 1.000000e+00 +; SSE-NEXT: [[COND_I44:%.*]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]] +; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[TMP17]], 1.000000e+00 +; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[TMP17]], float 1.000000e+00 ; SSE-NEXT: [[CMP_I39:%.*]] = fcmp olt float [[COND_I42]], -1.000000e+00 -; SSE-NEXT: [[COND_I40]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] +; SSE-NEXT: [[COND_I40:%.*]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]] ; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[COND_I40]], i32 0 +; SSE-NEXT: [[TMP19]] = insertelement <2 x float> [[TMP18]], float [[COND_I44]], i32 1 ; SSE-NEXT: br i1 [[EXITCOND]], label 
[[FOR_END:%.*]], label [[FOR_BODY]] ; SSE: for.end: ; SSE-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -15,18 +15,20 @@ ; CHECK: for.body6: ; CHECK-NEXT: br label [[FOR_BODY12:%.*]] ; CHECK: for.body12: -; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ] -; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]] -; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]] -; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[FOR_BODY6]] ], [ [[TMP7:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double [[ADD15]], 4.000000e+00 ; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]] -; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] -; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[TMP4]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL18]], i32 1 +; 
CHECK-NEXT: [[TMP7]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] ; CHECK: for.inc21: ; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] Index: test/Transforms/SLPVectorizer/X86/lookahead.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/lookahead.ll +++ test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -35,14 +35,18 @@ ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 -; CHECK-NEXT: [[SUBAB_0:%.*]] = fsub fast double [[A_0]], [[B_0]] ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] ; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] -; CHECK-NEXT: [[SUBCD_1:%.*]] = fsub fast double [[C_1]], [[D_1]] -; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[SUBAB_0]], [[SUBCD_0]] -; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[SUBCD_1]], [[SUBAB_1]] -; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 -; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[A_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C_1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[D_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUBCD_0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[SUBAB_1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x 
double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -172,14 +176,18 @@ ; CHECK-NEXT: [[C_1:%.*]] = load double, double* [[IDX5]], align 8 ; CHECK-NEXT: [[D_0:%.*]] = load double, double* [[IDX6]], align 8 ; CHECK-NEXT: [[D_1:%.*]] = load double, double* [[IDX7]], align 8 -; CHECK-NEXT: [[ADDAB_0:%.*]] = fadd fast double [[A_0]], [[B_0]] ; CHECK-NEXT: [[SUBCD_0:%.*]] = fsub fast double [[C_0]], [[D_0]] -; CHECK-NEXT: [[ADDCD_1:%.*]] = fadd fast double [[C_1]], [[D_1]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[A_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[C_1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B_0]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[D_1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[SUBAB_1:%.*]] = fsub fast double [[A_1]], [[B_1]] -; CHECK-NEXT: [[ADDABCD_0:%.*]] = fadd fast double [[ADDAB_0]], [[SUBCD_0]] -; CHECK-NEXT: [[ADDCDAB_1:%.*]] = fadd fast double [[ADDCD_1]], [[SUBAB_1]] -; CHECK-NEXT: store double [[ADDABCD_0]], double* [[IDX0]], align 8 -; CHECK-NEXT: store double [[ADDCDAB_1]], double* [[IDX1]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[SUBCD_0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[SUBAB_1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] ; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[ADD19]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB22]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; entry: