diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -95,8 +95,7 @@
   /// Try to vectorize a list of operands.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
-                          bool AllowReorder = false);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R);
 
   /// Try to vectorize a chain that may start at the operands of \p I.
   bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/PriorityQueue.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SetVector.h"
@@ -535,13 +536,16 @@
   return true;
 }
 
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask);
+static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order);
+
 namespace llvm {
 
 static void inversePermutation(ArrayRef<unsigned> Indices,
                                SmallVectorImpl<int> &Mask) {
   Mask.clear();
   const unsigned E = Indices.size();
-  Mask.resize(E, E + 1);
+  Mask.resize(E, UndefMaskElem);
   for (unsigned I = 0; I < E; ++I)
     Mask[Indices[I]] = I;
 }
@@ -581,6 +585,22 @@
   return Index;
 }
 
+/// Reorders the list of scalars in accordance with the given \p Order and then
+/// the \p Mask. \p Order - is the original order of the scalars, need to
+/// reorder scalars into an unordered state at first according to the given
+/// order. Then the ordered scalars are shuffled once again in accordance with
+/// the provided mask.
+static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
+                           ArrayRef<int> Mask) {
+  assert(!Mask.empty() && "Expected non-empty mask.");
+  SmallVector<Value *> Prev(Scalars.size(),
+                            UndefValue::get(Scalars.front()->getType()));
+  Prev.swap(Scalars);
+  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
+    if (Mask[I] != UndefMaskElem)
+      Scalars[Mask[I]] = Prev[I];
+}
+
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -645,13 +665,12 @@
   void buildTree(ArrayRef<Value *> Roots,
                  ArrayRef<Value *> UserIgnoreLst = None);
 
-  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
-  /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
-  /// into account (and updating it, if required) list of externally used
-  /// values stored in \p ExternallyUsedValues.
-  void buildTree(ArrayRef<Value *> Roots,
-                 ExtraValueToDebugLocsMap &ExternallyUsedValues,
-                 ArrayRef<Value *> UserIgnoreLst = None);
+  /// Builds external uses of the vectorized scalars, i.e. the list of
+  /// vectorized scalars to be extracted, their lanes and their scalar users. \p
+  /// ExternallyUsedValues contains additional list of external uses to handle
+  /// vectorization of reductions.
+  void
+  buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
 
   /// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() { @@ -659,8 +678,6 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); - NumOpsWantToKeepOrder.clear(); - NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -674,103 +691,22 @@ /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); - /// \returns The best order of instructions for vectorization. - Optional> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && - "All orders must have the same size as number of instructions in " - "tree node."); - auto I = std::max_element( - NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), - [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, - const decltype(NumOpsWantToKeepOrder)::value_type &D2) { - return D1.second < D2.second; - }); - if (I == NumOpsWantToKeepOrder.end() || - I->getSecond() <= NumOpsWantToKeepOriginalOrder) - return None; - - return makeArrayRef(I->getFirst()); - } - - /// Builds the correct order for root instructions. - /// If some leaves have the same instructions to be vectorized, we may - /// incorrectly evaluate the best order for the root node (it is built for the - /// vector of instructions without repeated instructions and, thus, has less - /// elements than the root node). This function builds the correct order for - /// the root node. - /// For example, if the root node is \, then the leaves - /// are \ and \. When we try to vectorize the first - /// leaf, it will be shrink to \. If instructions in this leaf should - /// be reordered, the best order will be \<1, 0\>. We need to extend this - /// order for the root node. For the root node this order should look like - /// \<3, 0, 1, 2\>. This function extends the order for the reused - /// instructions. - void findRootOrder(OrdersType &Order) { - // If the leaf has the same number of instructions to vectorize as the root - // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); - if (Order.size() == RootSize) - return; - SmallVector RealOrder(Order.size()); - std::swap(Order, RealOrder); - SmallVector Mask; - inversePermutation(RealOrder, Mask); - Order.assign(Mask.begin(), Mask.end()); - // The leaf has less number of instructions - need to find the true order of - // the root. - // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); - SmallVector Nodes(1, PNode); - SmallPtrSet Visited; - while (!Nodes.empty() && Order.size() != RootSize) { - const TreeEntry *PNode = Nodes.pop_back_val(); - if (!Visited.insert(PNode).second) - continue; - const TreeEntry &Node = *PNode; - for (const EdgeInfo &EI : Node.UserTreeIndices) - if (EI.UserTE) - Nodes.push_back(EI.UserTE); - if (Node.ReuseShuffleIndices.empty()) - continue; - // Build the order for the parent node. - OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); - SmallVector OrderCounter(Order.size(), 0); - // The algorithm of the order extension is: - // 1. Calculate the number of the same instructions for the order. - // 2. Calculate the index of the new order: total number of instructions - // with order less than the order of the current instruction + reuse - // number of the current instruction. - // 3. 
The new order is just the index of the instruction in the original - // vector of the instructions. - for (unsigned I : Node.ReuseShuffleIndices) - ++OrderCounter[Order[I]]; - SmallVector CurrentCounter(Order.size(), 0); - for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { - unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; - unsigned OrderIdx = Order[ReusedIdx]; - unsigned NewIdx = 0; - for (unsigned J = 0; J < OrderIdx; ++J) - NewIdx += OrderCounter[J]; - NewIdx += CurrentCounter[OrderIdx]; - ++CurrentCounter[OrderIdx]; - assert(NewOrder[NewIdx] == RootSize && - "The order index should not be written already."); - NewOrder[NewIdx] = I; - } - std::swap(Order, NewOrder); - } - assert(Order.size() == RootSize && - "Root node is expected or the size of the order must be the same as " - "the number of elements in the root node."); - assert(llvm::all_of(Order, - [RootSize](unsigned Val) { return Val != RootSize; }) && - "All indices must be initialized"); - } + /// Reorders the current graph to the most profitable order starting from the + /// root node to the leaf nodes. The best order is chosen only from the nodes + /// of the same size (vectorization factor). Smaller nodes are considered + /// parts of subgraph with smaller VF and they are reordered independently. We + /// can make it because we still need to extend smaller nodes to the wider VF + /// and we can merge reordering shuffles with the widening shuffles. + void reorderTopToBottom(); + + /// Reorders the current graph to the most profitable order starting from + /// leaves to the root. It allows to rotate small subgraphs and reduce the + /// number of reshuffles if the leaf nodes use the same order. In this case we + /// can merge the orders and just shuffle user node instead of shuffling its + /// operands. Plus, even the leaf nodes have different orders, it allows to + /// sink reordering in the graph closer to the root node and merge it later + /// during analysis. + void reorderBottomToTop(); /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of @@ -793,6 +729,10 @@ return MinVecRegSize; } + unsigned getMinVF(unsigned Sz) const { + return std::max(2U, getMinVecRegSize() / Sz); + } + unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { unsigned MaxVF = MaxVFOption.getNumOccurrences() ? MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); @@ -1621,6 +1561,22 @@ /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { + if (!ReorderIndices.empty()) { + // TODO: implement matching if the nodes are just reordered, still can + // treat the vector as the same if the list of scalars matches VL + // directly, without reordering. + SmallVector Mask; + inversePermutation(ReorderIndices, Mask); + if (VL.size() == Scalars.size()) + return std::equal( + VL.begin(), VL.end(), Mask.begin(), + [this](Value *V, int Idx) { return V == Scalars[Idx]; }); + ::addMask(Mask, ReuseShuffleIndices); + return VL.size() == Mask.size() && + std::equal( + VL.begin(), VL.end(), Mask.begin(), + [this](Value *V, int Idx) { return V == Scalars[Idx]; }); + } if (VL.size() == Scalars.size()) return std::equal(VL.begin(), VL.end(), Scalars.begin()); return VL.size() == ReuseShuffleIndices.size() && @@ -1701,6 +1657,12 @@ } } + /// Reorders operands of the node to the given mask \p Mask. 
+ void reorderOperands(ArrayRef Mask) { + for (ValueList &Operand : Operands) + reorderScalars(Operand, Mask); + } + /// \returns the \p OpIdx operand of this TreeEntry. ValueList &getOperand(unsigned OpIdx) { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1760,19 +1722,14 @@ return AltOp ? AltOp->getOpcode() : 0; } - /// Update operations state of this entry if reorder occurred. - bool updateStateIfReorder() { - if (ReorderIndices.empty()) - return false; - InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); - setOperations(S); - return true; - } - /// When ReuseShuffleIndices is empty it just returns position of \p V - /// within vector of Scalars. Otherwise, try to remap on its reuse index. + /// When ReuseReorderShuffleIndices is empty it just returns position of \p + /// V within vector of Scalars. Otherwise, try to remap on its reuse index. int findLaneForValue(Value *V) const { unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); + if (!ReorderIndices.empty()) + FoundLane = ReorderIndices[FoundLane]; + assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); if (!ReuseShuffleIndices.empty()) { FoundLane = std::distance(ReuseShuffleIndices.begin(), find(ReuseShuffleIndices, FoundLane)); @@ -1856,7 +1813,7 @@ TreeEntry *newTreeEntry(ArrayRef VL, Optional Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef ReuseShuffleIndices = None, + ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; @@ -1869,7 +1826,7 @@ Optional Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef ReuseShuffleIndices = None, + ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && @@ -1877,12 +1834,25 @@ VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); - Last->setOperations(S); + if (ReorderIndices.empty()) { + Last->Scalars.assign(VL.begin(), VL.end()); + Last->setOperations(S); + } else { + // Reorder scalars and build final mask. + Last->Scalars.assign(VL.size(), nullptr); + transform(ReorderIndices, Last->Scalars.begin(), + [VL](unsigned Idx) -> Value * { + if (Idx >= VL.size()) + return UndefValue::get(VL.front()->getType()); + return VL[Idx]; + }); + InstructionsState S = getSameOpcode(Last->Scalars); + Last->setOperations(S); + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); + } if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); @@ -2434,14 +2404,6 @@ } }; - /// Contains orders of operations along with the number of bundles that have - /// operations in this order. It stores only those orders that require - /// reordering, if reordering is not required it is counted using \a - /// NumOpsWantToKeepOriginalOrder. - DenseMap NumOpsWantToKeepOrder; - /// Number of bundles that do not require reordering. 
- unsigned NumOpsWantToKeepOriginalOrder = 0; - // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -2594,21 +2556,438 @@ }; } -void BoUpSLP::buildTree(ArrayRef Roots, - ArrayRef UserIgnoreLst) { - ExtraValueToDebugLocsMap ExternallyUsedValues; - buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); +/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses +/// contains original mask for the scalars reused in the node. Procedure +/// transform this mask in accordance with the given \p Mask. +static void reorderReuses(SmallVectorImpl &Reuses, ArrayRef Mask) { + assert(!Mask.empty() && Reuses.size() == Mask.size() && + "Expected non-empty mask."); + SmallVector Prev(Reuses.begin(), Reuses.end()); + Prev.swap(Reuses); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + if (Mask[I] != UndefMaskElem) + Reuses[Mask[I]] = Prev[I]; } -void BoUpSLP::buildTree(ArrayRef Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef UserIgnoreLst) { - deleteTree(); - UserIgnoreList = UserIgnoreLst; - if (!allSameType(Roots)) +/// Reorders the given \p Order according to the given \p Mask. \p Order - is +/// the original order of the scalars. Procedure transforms the provided order +/// in accordance with the given \p Mask. If the resulting \p Order is just an +/// identity order, \p Order is cleared. +static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + SmallVector MaskOrder; + if (Order.empty()) { + MaskOrder.resize(Mask.size()); + std::iota(MaskOrder.begin(), MaskOrder.end(), 0); + } else { + inversePermutation(Order, MaskOrder); + } + reorderReuses(MaskOrder, Mask); + if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { + Order.clear(); return; - buildTree_rec(Roots, 0, EdgeInfo()); + } + Order.assign(Mask.size(), Mask.size()); + for (unsigned I = 0, E = Mask.size(); I < E; ++I) + if (MaskOrder[I] != UndefMaskElem) + Order[MaskOrder[I]] = I; + fixupOrderingIndices(Order); +} + +void BoUpSLP::reorderTopToBottom() { + // Maps VF to the graph nodes. + DenseMap> VFToOrderedEntries; + // ExtractElement gather nodes which can be vectorized and need to handle + // their ordering. + DenseMap GathersToOrders; + // Find all reorderable nodes with the given VF. + // Currently the are vectorized loads,extracts + some gathering of extracts. + for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + const std::unique_ptr &TE) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE->ReuseShuffleIndices.empty()) + return; + if (TE->State == TreeEntry::Vectorize && + isa(TE->getMainOp()) && + !TE->isAltShuffle()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + } else if (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle() && + isa(cast(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + } + } + }); + + // Reorder the graph nodes according to their vectorization factor. 
+ for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; + VF /= 2) { + auto It = VFToOrderedEntries.find(VF); + if (It == VFToOrderedEntries.end()) + continue; + // Try to find the most profitable order. We just are looking for the most + // used order and reorder scalar elements in the nodes according to this + // mostly used order. + const SmallPtrSetImpl &OrderedEntries = It->getSecond(); + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + DenseMap OrdersUses; + SmallPtrSet VisitedOps; + for (const TreeEntry *OpTE : OrderedEntries) { + // No need to reorder this nodes, still need to extend and to use shuffle, + // just need to merge reordering shuffle and the reuse shuffle. + if (!OpTE->ReuseShuffleIndices.empty()) + continue; + // Count number of orders uses. + const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + // Stores actually store the mask, not the order, need to invert. + if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && + OpTE->getOpcode() == Instruction::Store && !Order.empty()) { + SmallVector Mask; + inversePermutation(Order, Mask); + unsigned E = Order.size(); + OrdersType CurrentOrder(E, E); + transform(Mask, CurrentOrder.begin(), [E](int Idx) { + return Idx == UndefMaskElem ? E : static_cast(Idx); + }); + fixupOrderingIndices(CurrentOrder); + ++OrdersUses.try_emplace(CurrentOrder).first->getSecond(); + } else { + ++OrdersUses.try_emplace(Order).first->getSecond(); + } + } + // Set order of the user node. + if (OrdersUses.empty()) + continue; + // Choose the most used order. + ArrayRef BestOrder = OrdersUses.begin()->first; + unsigned Cnt = OrdersUses.begin()->second; + for (const auto &Pair : llvm::drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + BestOrder = Pair.first; + Cnt = Pair.second; + } + } + // Set order of the user node. + if (BestOrder.empty()) + continue; + SmallVector Mask; + inversePermutation(BestOrder, Mask); + SmallVector MaskOrder(BestOrder.size(), UndefMaskElem); + unsigned E = BestOrder.size(); + transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast(I) : UndefMaskElem; + }); + // Do an actual reordering, if profitable. + for (std::unique_ptr &TE : VectorizableTree) { + // Just do the reordering for the nodes with the given VF. + if (TE->Scalars.size() != VF) { + if (TE->ReuseShuffleIndices.size() == VF) { + // Need to reorder the reuses masks of the operands with smaller VF to + // be able to find the match between the graph nodes and scalar + // operands of the given node during vectorization/cost estimation. + assert(all_of(TE->UserTreeIndices, + [VF, &TE](const EdgeInfo &EI) { + return EI.UserTE->Scalars.size() == VF || + EI.UserTE->Scalars.size() == + TE->Scalars.size(); + }) && + "All users must be of VF size."); + // Update ordering of the operands with the smaller VF than the given + // one. + reorderReuses(TE->ReuseShuffleIndices, Mask); + } + continue; + } + if (TE->State == TreeEntry::Vectorize && + isa(TE->getMainOp()) && + !TE->isAltShuffle()) { + // Build correct orders for extract{element,value}, loads and + // stores. + reorderOrder(TE->ReorderIndices, Mask); + if (isa(TE->getMainOp())) + TE->reorderOperands(Mask); + } else { + // Reorder the node and its operands. 
+ TE->reorderOperands(Mask); + assert(TE->ReorderIndices.empty() && + "Expected empty reorder sequence."); + reorderScalars(TE->Scalars, Mask); + } + if (!TE->ReuseShuffleIndices.empty()) { + // Apply reversed order to keep the original ordering of the reused + // elements to avoid extra reorder indices shuffling. + OrdersType CurrentOrder; + reorderOrder(CurrentOrder, MaskOrder); + SmallVector NewReuses; + inversePermutation(CurrentOrder, NewReuses); + addMask(NewReuses, TE->ReuseShuffleIndices); + TE->ReuseShuffleIndices.swap(NewReuses); + } + } + } +} + +void BoUpSLP::reorderBottomToTop() { + SetVector OrderedEntries; + DenseMap GathersToOrders; + // Find all reorderable leaf nodes with the given VF. + // Currently the are vectorized loads,extracts without alternate operands + + // some gathering of extracts. + SmallVector NonVectorized; + for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, + &NonVectorized]( + const std::unique_ptr &TE) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE->ReuseShuffleIndices.empty()) + return; + if (TE->State == TreeEntry::Vectorize && + isa(TE->getMainOp()) && + !TE->isAltShuffle()) { + OrderedEntries.insert(TE.get()); + } else if (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::ExtractElement && + !TE->isAltShuffle() && + isa(cast(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector with a single user only. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if ((Reuse || !CurrentOrder.empty()) && + !any_of( + VectorizableTree, [&TE](const std::unique_ptr &Entry) { + return Entry->State == TreeEntry::NeedToGather && + Entry.get() != TE.get() && Entry->isSame(TE->Scalars); + })) { + OrderedEntries.insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + } + } + if (TE->State != TreeEntry::Vectorize) + NonVectorized.push_back(TE.get()); + }); + + // Checks if the operands of the users are reordarable and have only single + // use. + auto &&CheckOperands = + [this, &NonVectorized](const auto &Data, + SmallVectorImpl &GatherOps) { + for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { + if (any_of(Data.second, + [I](const std::pair &OpData) { + return OpData.first == I && + OpData.second->State == TreeEntry::Vectorize; + })) + continue; + ArrayRef VL = Data.first->getOperand(I); + const TreeEntry *TE = nullptr; + const auto *It = find_if(VL, [this, &TE](Value *V) { + TE = getTreeEntry(V); + return TE; + }); + if (It != VL.end() && TE->isSame(VL)) + return false; + TreeEntry *Gather = nullptr; + if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (TE->isSame(VL)) { + Gather = TE; + return true; + } + return false; + }) > 1) + return false; + if (Gather) + GatherOps.push_back(Gather); + } + return true; + }; + // 1. Propagate order to the graph nodes, which use only reordered nodes. + // I.e., if the node has operands, that are reordered, try to make at least + // one operand order in the natural order and reorder others + reorder the + // user node itself. + SmallPtrSet Visited; + while (!OrderedEntries.empty()) { + // 1. Filter out only reordered nodes. + // 2. 
If the entry has multiple uses - skip it and jump to the next node. + MapVector>> Users; + SmallVector Filtered; + for (TreeEntry *TE : OrderedEntries) { + if (!(TE->State == TreeEntry::Vectorize || + (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::ExtractElement)) || + TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || + !all_of(drop_begin(TE->UserTreeIndices), + [TE](const EdgeInfo &EI) { + return EI.UserTE == TE->UserTreeIndices.front().UserTE; + }) || + !Visited.insert(TE).second) { + Filtered.push_back(TE); + continue; + } + // Build a map between user nodes and their operands order to speedup + // search. The graph currently does not provide this dependency directly. + for (EdgeInfo &EI : TE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + auto It = Users.find(UserTE); + if (It == Users.end()) + It = Users.insert({UserTE, {}}).first; + It->second.emplace_back(EI.EdgeIdx, TE); + } + } + // Erase filtered entries. + for_each(Filtered, + [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (const auto &Data : Users) { + // Check that operands are used only in the User node. + SmallVector GatherOps; + if (!CheckOperands(Data, GatherOps)) { + for_each(Data.second, + [&OrderedEntries](const std::pair &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + DenseMap OrdersUses; + SmallPtrSet VisitedOps; + for (const auto &Op : Data.second) { + TreeEntry *OpTE = Op.second; + if (!OpTE->ReuseShuffleIndices.empty()) + continue; + const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + // Stores actually store the mask, not the order, need to invert. + if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && + OpTE->getOpcode() == Instruction::Store && !Order.empty()) { + SmallVector Mask; + inversePermutation(Order, Mask); + unsigned E = Order.size(); + OrdersType CurrentOrder(E, E); + transform(Mask, CurrentOrder.begin(), [E](int Idx) { + return Idx == UndefMaskElem ? E : static_cast(Idx); + }); + fixupOrderingIndices(CurrentOrder); + ++OrdersUses.try_emplace(CurrentOrder).first->getSecond(); + } else { + ++OrdersUses.try_emplace(Order).first->getSecond(); + } + if (VisitedOps.insert(OpTE).second) + OrdersUses.try_emplace({}, 0).first->getSecond() += + OpTE->UserTreeIndices.size(); + --OrdersUses[{}]; + } + // If no orders - skip current nodes and jump to the next one, if any. + if (OrdersUses.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Choose the best order. + ArrayRef BestOrder = OrdersUses.begin()->first; + unsigned Cnt = OrdersUses.begin()->second; + for (const auto &Pair : llvm::drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + BestOrder = Pair.first; + Cnt = Pair.second; + } + } + // Set order of the user node (reordering of operands and user nodes). + if (BestOrder.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Erase operands from OrderedEntries list and adjust their orders. 
+ VisitedOps.clear(); + SmallVector Mask; + inversePermutation(BestOrder, Mask); + SmallVector MaskOrder(BestOrder.size(), UndefMaskElem); + unsigned E = BestOrder.size(); + transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { + return I < E ? static_cast(I) : UndefMaskElem; + }); + for (const std::pair &Op : Data.second) { + TreeEntry *TE = Op.second; + OrderedEntries.remove(TE); + if (!VisitedOps.insert(TE).second) + continue; + if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { + // Just reorder reuses indices. + reorderReuses(TE->ReuseShuffleIndices, Mask); + continue; + } + // Gathers are processed separately. + if (TE->State != TreeEntry::Vectorize) + continue; + assert((BestOrder.size() == TE->ReorderIndices.size() || + TE->ReorderIndices.empty()) && + "Non-matching sizes of user/operand entries."); + reorderOrder(TE->ReorderIndices, Mask); + } + // For gathers just need to reorder its scalars. + for (TreeEntry *Gather : GatherOps) { + if (!Gather->ReuseShuffleIndices.empty()) + continue; + assert(Gather->ReorderIndices.empty() && + "Unexpected reordering of gathers."); + reorderScalars(Gather->Scalars, Mask); + OrderedEntries.remove(Gather); + } + // Reorder operands of the user node and set the ordering for the user + // node itself. + if (Data.first->State != TreeEntry::Vectorize || + !isa( + Data.first->getMainOp()) || + Data.first->isAltShuffle()) + Data.first->reorderOperands(Mask); + if (!isa(Data.first->getMainOp()) || + Data.first->isAltShuffle()) { + reorderScalars(Data.first->Scalars, Mask); + reorderOrder(Data.first->ReorderIndices, MaskOrder); + if (Data.first->ReuseShuffleIndices.empty() && + !Data.first->ReorderIndices.empty()) { + // Insert user node to the list to try to sink reordering deeper in + // the graph. + OrderedEntries.insert(Data.first); + } + } else { + reorderOrder(Data.first->ReorderIndices, Mask); + } + } + } +} +void BoUpSLP::buildExternalUses( + const ExtraValueToDebugLocsMap &ExternallyUsedValues) { // Collect the values that we need to extract from the tree. for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -2667,6 +3046,116 @@ } } +void BoUpSLP::buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst) { + deleteTree(); + UserIgnoreList = UserIgnoreLst; + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} + +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, + const TargetTransformInfo &TTI, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl &Order, + SmallVectorImpl &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. 
+ PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + Align CommonAlignment = cast(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast(V)->getAlign()); + if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), + CommonAlignment)) + return LoadsState::ScatterVectorize; + } + + return LoadsState::Gather; +} + +/// Order may have elements assigned special value (size) which is out of +/// bounds. Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and used in order to avoid undef values +/// have effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indices for undef values positions. +/// As an example below Order has two undef positions and they have assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +/// \returns Fixed ordering. +static void fixupOrderingIndices(SmallVectorImpl &Order) { + const unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + SmallVector MaskedIndices; + for (unsigned I = 0; I < Sz; ++I) { + if (Order[I] < Sz) + UsedIndices.set(Order[I]); + else + MaskedIndices.push_back(I); + } + if (MaskedIndices.empty()) + return; + SmallVector AvailableIndices(MaskedIndices.size()); + unsigned Cnt = 0; + int Idx = UsedIndices.find_first(); + do { + AvailableIndices[Cnt] = Idx; + Idx = UsedIndices.find_next(Idx); + ++Cnt; + } while (Idx > 0); + assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); + for (int I = 0, E = MaskedIndices.size(); I < E; ++I) + Order[MaskedIndices[I]] = AvailableIndices[I]; +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -2776,7 +3265,7 @@ } // Check that every instruction appears once in this bundle. - SmallVector ReuseShuffleIndicies; + SmallVector ReuseShuffleIndicies; SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { @@ -2870,7 +3359,6 @@ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); - ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time @@ -2888,12 +3376,11 @@ dbgs() << " " << Idx; dbgs() << "\n"; }); + fixupOrderingIndices(CurrentOrder); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. 
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2913,8 +3400,14 @@ // Check that we have a buildvector and not a shuffle of 2 or more // different vectors. ValueSet SourceVectors; - for (Value *V : VL) + int MinIdx = std::numeric_limits::max(); + for (Value *V : VL) { SourceVectors.insert(cast(V)->getOperand(0)); + Optional Idx = *getInsertIndex(V, 0); + if (!Idx || *Idx == UndefMaskElem) + continue; + MinIdx = std::min(MinIdx, *Idx); + } if (count_if(VL, [&SourceVectors](Value *V) { return !SourceVectors.contains(V); @@ -2922,13 +3415,35 @@ // Found 2nd source vector - cancel. LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " "different source vectors.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); BS.cancelScheduling(VL, VL0); return; } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx); + auto OrdCompare = [](const std::pair &P1, + const std::pair &P2) { + return P1.first > P2.first; + }; + PriorityQueue, SmallVector>, + decltype(OrdCompare)> + Indeces(OrdCompare); + for (int I = 0, E = VL.size(); I < E; ++I) { + Optional Idx = *getInsertIndex(VL[I], 0); + if (!Idx || *Idx == UndefMaskElem) + continue; + Indeces.emplace(*Idx, I); + } + OrdersType CurrentOrder(VL.size(), VL.size()); + bool IsIdentity = true; + for (int I = 0, E = VL.size(); I < E; ++I) { + CurrentOrder[Indeces.top().second] = I; + IsIdentity &= Indeces.top().second == I; + Indeces.pop(); + } + if (IsIdentity) + CurrentOrder.clear(); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + None, CurrentOrder); LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); constexpr int NumOps = 2; @@ -2949,90 +3464,52 @@ // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); - return; - } - - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - SmallVector PointerOps(VL.size()); - auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); - if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); - return; - } - *POIter = L->getPointerOperand(); - ++POIter; - } - + SmallVector PointerOps; OrdersType CurrentOrder; - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + TreeEntry *TE = nullptr; + switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, + PointerOps)) { + case LoadsState::Vectorize: if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original loads are consecutive and does not require reordering. 
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - Optional Diff = getPointersDiff( - ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); - // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == VL.size() - 1) { - if (CurrentOrder.empty()) { - // Original loads are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); - } else { - // Need to reorder. - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; - } - return; - } - Align CommonAlignment = cast(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); - if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) { - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, - S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() - << "SLP: added a vector of non-consecutive loads.\n"); - return; + fixupOrderingIndices(CurrentOrder); + // Need to reorder. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } + TE->setOperandsInOrder(); + break; + case LoadsState::ScatterVectorize: + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); + break; + case LoadsState::Gather: + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); +#ifndef NDEBUG + Type *ScalarTy = VL0->getType(); + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + else if (any_of(VL, [](Value *V) { + return !cast(V)->isSimple(); + })) + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + else + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); +#endif // NDEBUG + break; } - - LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -3279,21 +3756,19 @@ if (static_cast(*Dist) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. 
- ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { + fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3632,19 +4107,55 @@ Mask.append(SubMask.begin(), SubMask.end()); return; } - SmallVector NewMask(SubMask.size(), SubMask.size()); + SmallVector NewMask(SubMask.size(), UndefMaskElem); int TermValue = std::min(Mask.size(), SubMask.size()); for (int I = 0, E = SubMask.size(); I < E; ++I) { if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) { - NewMask[I] = UndefMaskElem; + Mask[SubMask[I]] >= TermValue) continue; - } NewMask[I] = Mask[SubMask[I]]; } Mask.swap(NewMask); } +/// Build shuffle mask for shuffle graph entries and lists of main and alternate +/// operations operands. +static void +buildSuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, + ArrayRef ReusesIndices, + const function_ref IsAltOp, + SmallVectorImpl &Mask, + SmallVectorImpl *OpScalars = nullptr, + SmallVectorImpl *AltScalars = nullptr) { + unsigned Sz = VL.size(); + Mask.assign(Sz, UndefMaskElem); + SmallVector OrderMask; + if (!ReorderIndices.empty()) + inversePermutation(ReorderIndices, OrderMask); + for (unsigned I = 0; I < Sz; ++I) { + unsigned Idx = I; + if (!ReorderIndices.empty()) + Idx = OrderMask[I]; + auto *OpInst = cast(VL[I]); + if (IsAltOp(OpInst)) { + Mask[Idx] = Sz + I; + if (AltScalars) + AltScalars->push_back(OpInst); + } else { + Mask[Idx] = I; + if (OpScalars) + OpScalars->push_back(OpInst); + } + } + if (!ReusesIndices.empty()) { + SmallVector NewMask(ReusesIndices.size(), UndefMaskElem); + transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { + return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem; + }); + Mask.swap(NewMask); + } +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -3806,6 +4317,86 @@ if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. + if (VL.size() > 2 && E->getOpcode() == Instruction::Load && + !E->isAltShuffle()) { + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); + for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, + *SE, CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. 
+ if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + InstructionCost GatherCost = 0; + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (VectorizedLoads.contains(VL[I])) + continue; + GatherCost += getGatherCost(VL.slice(I, VF)); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast(V); + ScalarsCost += TTI->getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, LI); + } + auto *LI = cast(E->getMainOp()); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, LI); + GatherCost += ScatterVectorizeCnt * + TTI->getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + // Add the cost for the subvectors shuffling. + GatherCost += ((VL.size() - VF) / VF) * + TTI->getShuffleCost(TTI::SK_Select, VecTy); + return ReuseShuffleCost + GatherCost - ScalarsCost; + } + } return ReuseShuffleCost + getGatherCost(VL); } InstructionCost CommonCost = 0; @@ -3898,29 +4489,33 @@ return CommonCost; } case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && + "Unique insertelements only are expected."); auto *SrcVecTy = cast(VL0->getType()); unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); APInt DemandedElts = APInt::getNullValue(NumElts); // TODO: Add support for Instruction::InsertValue. 
- unsigned Offset = UINT_MAX; + SmallVector Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(NumElts - NumScalars, UndefMaskElem); + } else { + Mask.assign(NumElts, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } + unsigned Offset = *getInsertIndex(VL0, 0); bool IsIdentity = true; - SmallVector ShuffleMask(NumElts, UndefMaskElem); + SmallVector PrevMask(NumElts, UndefMaskElem); + Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Optional InsertIdx = getInsertIndex(VL[I], 0); + Optional InsertIdx = getInsertIndex(VL[PrevMask[I]], 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - unsigned Idx = *InsertIdx; - DemandedElts.setBit(Idx); - if (Idx < Offset) { - Offset = Idx; - IsIdentity &= I == 0; - } else { - assert(Idx >= Offset && "Failed to find vector index offset"); - IsIdentity &= Idx - Offset == I; - } - ShuffleMask[Idx] = I; + DemandedElts.setBit(*InsertIdx); + IsIdentity &= *InsertIdx - Offset == I; + Mask[*InsertIdx - Offset] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -3935,8 +4530,23 @@ TargetTransformInfo::SK_PermuteSingleSrc, FixedVectorType::get(SrcVecTy->getElementType(), Sz)); } else if (!IsIdentity) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, - ShuffleMask); + auto *FirstInsert = + cast(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, + cast(V)->getOperand(0)); + })); + if (isa(FirstInsert)) { + Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + } else { + SmallVector InsertMask(NumElts); + std::iota(InsertMask.begin(), InsertMask.end(), 0); + for (unsigned I = 0; I < NumElts; I++) { + if (Mask[I] != UndefMaskElem) + InsertMask[Offset + I] = NumElts + I; + } + Cost += + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + } } return Cost; @@ -4218,14 +4828,16 @@ TTI::CastContextHint::None, CostKind); } - SmallVector Mask(E->Scalars.size()); - for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) { - auto *OpInst = cast(E->Scalars[I]); - assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? 
End : 0); - } - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); + SmallVector Mask; + buildSuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask); + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -5148,13 +5760,17 @@ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { + assert( + (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && + "PHI reordering is free."); auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -5205,48 +5821,41 @@ return NewV; } case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); Builder.SetInsertPoint(VL0); Value *V = vectorizeTree(E->getOperand(1)); + // Create InsertVector shuffle if necessary + auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast(V)->getOperand(0)); + })); const unsigned NumElts = - cast(VL0->getType())->getNumElements(); + cast(FirstInsert->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); + unsigned Offset = *getInsertIndex(VL0, 0); + assert(Offset < NumElts && "Failed to find vector index offset"); + + // Create shuffle to resize vector + SmallVector Mask; + if (!E->ReorderIndices.empty()) { + inversePermutation(E->ReorderIndices, Mask); + Mask.append(NumElts - NumScalars, UndefMaskElem); + } else { + Mask.assign(NumElts, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } // Create InsertVector shuffle if necessary - Instruction *FirstInsert = nullptr; bool IsIdentity = true; - unsigned Offset = UINT_MAX; + SmallVector PrevMask(NumElts, UndefMaskElem); + Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[I]; - if (!FirstInsert && - !is_contained(E->Scalars, cast(Scalar)->getOperand(0))) - FirstInsert = cast(Scalar); + Value *Scalar = E->Scalars[PrevMask[I]]; Optional InsertIdx = getInsertIndex(Scalar, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - unsigned Idx = *InsertIdx; - if (Idx < Offset) { - Offset = Idx; - IsIdentity &= I == 0; - } else { - assert(Idx >= Offset && "Failed to find vector index offset"); - IsIdentity &= Idx - Offset == I; - } - } - assert(Offset < NumElts && "Failed to find vector index offset"); - - // Create shuffle to resize vector - SmallVector Mask(NumElts, UndefMaskElem); - if (!IsIdentity) { - for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[I]; - Optional InsertIdx = getInsertIndex(Scalar, 0); - if (!InsertIdx || *InsertIdx == UndefMaskElem) - continue; - Mask[*InsertIdx - Offset] = I; - } - } else { - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + IsIdentity &= *InsertIdx - Offset == I; + 
Mask[*InsertIdx - Offset] = I; } if (!IsIdentity || NumElts != NumScalars) V = Builder.CreateShuffleVector(V, Mask); @@ -5293,6 +5902,7 @@ auto *CI = cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5315,6 +5925,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5335,6 +5946,7 @@ } Value *V = Builder.CreateSelect(Cond, True, False); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5358,6 +5970,7 @@ if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5401,6 +6014,7 @@ if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5412,9 +6026,6 @@ case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = E->updateStateIfReorder(); - if (IsReorder) - VL0 = E->getMainOp(); setInsertPointAfterBundle(E); LoadInst *LI = cast(VL0); @@ -5452,9 +6063,7 @@ return V; } case Instruction::Store: { - bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = cast( - IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0); + auto *SI = cast(VL0); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); @@ -5513,6 +6122,7 @@ if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5579,6 +6189,7 @@ ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5626,19 +6237,14 @@ // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. 
ValueList OpScalars, AltScalars; - unsigned Sz = E->Scalars.size(); - SmallVector Mask(Sz); - for (unsigned I = 0; I < Sz; ++I) { - auto *OpInst = cast(E->Scalars[I]); - assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[I] = Sz + I; - AltScalars.push_back(E->Scalars[I]); - } else { - Mask[I] = I; - OpScalars.push_back(E->Scalars[I]); - } - } + SmallVector Mask; + buildSuffleEntryMask( + E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, + [E](Instruction *I) { + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + return I->getOpcode() == E->getAltOpcode(); + }, + Mask, &OpScalars, &AltScalars); propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); @@ -5646,7 +6252,6 @@ Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6819,44 +7424,6 @@ return Changed; } -/// Order may have elements assigned special value (size) which is out of -/// bounds. Such indices only appear on places which correspond to undef values -/// (see canReuseExtract for details) and used in order to avoid undef values -/// have effect on operands ordering. -/// The first loop below simply finds all unused indices and then the next loop -/// nest assigns these indices for undef values positions. -/// As an example below Order has two undef positions and they have assigned -/// values 3 and 7 respectively: -/// before: 6 9 5 4 9 2 1 0 -/// after: 6 3 5 4 7 2 1 0 -/// \returns Fixed ordering. -static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef Order) { - BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); - const unsigned Sz = NewOrder.size(); - SmallBitVector UsedIndices(Sz); - SmallVector MaskedIndices; - for (int I = 0, E = NewOrder.size(); I < E; ++I) { - if (NewOrder[I] < Sz) - UsedIndices.set(NewOrder[I]); - else - MaskedIndices.push_back(I); - } - if (MaskedIndices.empty()) - return NewOrder; - SmallVector AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - NewOrder[MaskedIndices[I]] = AvailableIndices[I]; - return NewOrder; -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -6872,19 +7439,13 @@ << "\n"); R.buildTree(Chain); - Optional> Order = R.bestOrder(); - // TODO: Handle orders of size less than number of elements in the vector. - if (Order && Order->size() == Chain.size()) { - // TODO: reorder tree nodes without tree rebuilding. 
- SmallVector ReorderedOps(Chain.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Chain](const unsigned Idx) { return Chain[Idx]; }); - R.buildTree(ReorderedOps); - } if (R.isTreeTinyAndNotFullyVectorizable()) return false; if (R.isLoadCombineCandidate()) return false; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); R.computeMinimumValueSizes(); @@ -7007,7 +7568,7 @@ unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); + unsigned MinVF = R.getMinVF(EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); @@ -7080,11 +7641,10 @@ if (!A || !B) return false; Value *VL[] = {A, B}; - return tryToVectorizeList(VL, R, /*AllowReorder=*/true); + return tryToVectorizeList(VL, R); } -bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, - bool AllowReorder) { +bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { if (VL.size() < 2) return false; @@ -7118,7 +7678,7 @@ } unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); + unsigned MinVF = R.getMinVF(Sz); unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { @@ -7171,18 +7731,11 @@ << "\n"); R.buildTree(Ops); - if (AllowReorder) { - Optional> Order = R.bestOrder(); - if (Order) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector ReorderedOps(Ops.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Ops](const unsigned Idx) { return Ops[Idx]; }); - R.buildTree(ReorderedOps); - } - } if (R.isTreeTinyAndNotFullyVectorizable()) continue; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); @@ -7826,22 +8379,14 @@ unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { ArrayRef VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, IgnoreList); - Optional> Order = V.bestOrder(); - if (Order) { - assert(Order->size() == VL.size() && - "Order size must be the same as number of vectorized " - "instructions."); - // TODO: reorder tree nodes without tree rebuilding. - SmallVector ReorderedOps(VL.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [VL](const unsigned Idx) { return VL[Idx]; }); - V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); - } + V.buildTree(VL, IgnoreList); if (V.isTreeTinyAndNotFullyVectorizable()) break; if (V.isLoadCombineReductionCandidate(RdxKind)) break; + V.reorderTopToBottom(); + V.reorderBottomToTop(); + V.buildExternalUses(ExternallyUsedValues); // For a poison-safe boolean logic reduction, do not replace select // instructions with logic ops. All reduced values will be frozen (see @@ -8314,7 +8859,7 @@ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. 
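The call-site pattern that disappears in these hunks is worth spelling out: vectorizeStoreChain (and, below, tryToVectorizeList and the horizontal-reduction path) used to ask the tree for bestOrder(), apply it to the scalar list via transform over fixupOrderingIndices(*Order), and then rebuild the entire tree from the reordered operands. With this patch the tree is built once and reordered in place through reorderTopToBottom() and reorderBottomToTop(), with buildExternalUses() now an explicit, separate step. The sketch below only shows what the removed transform computed, with plain std types and an assumed helper name:

#include <cstdio>
#include <string>
#include <vector>

// Equivalent of the removed transform(...) call: Reordered[I] = Ops[Order[I]].
template <typename T>
static std::vector<T> applyOrder(const std::vector<T> &Ops,
                                 const std::vector<unsigned> &Order) {
  std::vector<T> Reordered(Ops.size());
  for (unsigned I = 0, E = Order.size(); I < E; ++I)
    Reordered[I] = Ops[Order[I]];
  return Reordered;
}

int main() {
  std::vector<std::string> Chain = {"st0", "st1", "st2", "st3"}; // stand-ins for stores
  std::vector<unsigned> Order = {2, 0, 1, 3};                    // assumed best order
  for (const auto &S : applyOrder(Chain, Order))
    std::printf("%s ", S.c_str()); // st2 st0 st1 st3, previously fed to a second buildTree()
  std::printf("\n");
}

Reordering the already-built tree avoids that second buildTree() per candidate, which is the main point of the new reorderTopToBottom()/reorderBottomToTop() entry points.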
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false); + return tryToVectorizeList(BuildVectorOpds, R); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, @@ -8329,7 +8874,7 @@ return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); - return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true); + return tryToVectorizeList(BuildVectorInsts, R); } bool SLPVectorizerPass::vectorizeSimpleInstructions( @@ -8521,8 +9066,7 @@ // So allow tryToVectorizeList to reorder them if it is beneficial. This // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, - /*AllowReorder=*/true)) { + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; @@ -8533,8 +9077,7 @@ } // Final attempt to vectorize phis with the same types. if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) { - if (Candidates.size() > 1 && - tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) { + if (Candidates.size() > 1 && tryToVectorizeList(Candidates, R)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -6,16 +6,14 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <2 x i64> [[TMP7]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -74,16 +72,14 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> 
[[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -114,16 +110,14 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -229,22 +223,20 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], 
-; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) -; CHECK-NEXT: ret i32 [[TMP15]] +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -6,16 +6,14 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <2 x i64> [[TMP7]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -74,16 +72,14 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = 
shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -114,16 +110,14 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -229,22 +223,20 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> 
[[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) -; CHECK-NEXT: ret i32 [[TMP15]] +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -338,32 +338,26 @@ ret void } -; Dont vectorization of following code for float data type as sub is not commutative- -; fc[0] = fb[0]+fa[0]; -; fc[1] = fa[1]-fb[1]; -; fc[2] = fa[2]+fb[2]; -; fc[3] = fb[3]-fa[3]; -; In the above code we can swap the 1st and 2nd operation as fadd is commutative -; but not 2nd or 4th as fsub is not commutative. 
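The comment deleted above spelled out why this case used to be rejected. Reconstructing it as C-style source makes the intent of the renamed test easier to see; fa, fb, and fc correspond to the globals in the IR, and the exact source shape is an inference from that comment, not taken from an existing file:

// Pattern inferred from the removed comment: fadd operands may be swapped
// (fb[0]+fa[0] == fa[0]+fb[0]) but fsub operands may not, which is what the
// old ordering logic could not handle for this block.
float fa[4], fb[4], fc[4];

void vec_shuff_reorder() {
  fc[0] = fb[0] + fa[0];
  fc[1] = fa[1] - fb[1];
  fc[2] = fa[2] + fb[2];
  fc[3] = fb[3] - fa[3];
}

With the improved reordering the block is now vectorized as an fadd/fsub pair blended by a single shufflevector, which is what the updated CHECK lines below verify.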
- -define void @no_vec_shuff_reorder() #0 { -; CHECK-LABEL: @no_vec_shuff_reorder( +define void @vec_shuff_reorder() #0 { +; CHECK-LABEL: @vec_shuff_reorder( ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1) to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1) to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast 
([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -61,12 +61,12 @@ ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] ; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], ; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> ; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -30,11 +30,11 @@ ; CHECK-LABEL: @fextr1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <2 x double>, <2 x double>* undef, align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[LD]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -8,12 +8,12 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; 
CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -11,21 +11,21 @@ ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 @@ -69,22 +69,22 @@ ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> 
[[TMP2]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -26,32 +26,31 @@ ; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3 -; CHECK-NEXT: 
[[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -183,11 +183,11 @@ ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -15,8 +15,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], +; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -130,8 +130,8 @@ ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 @@ -139,8 +139,8 @@ ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[G20]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX30]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,15 +7,15 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A7:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 
x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -57,15 +57,15 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A6:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -111,15 +111,15 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A4:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 
x i32> poison, i32 [[A1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]]