diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -581,6 +581,25 @@ return Index; } +/// Reorders the list of scalars in accordance with the given \p Order and then +/// the \p Mask. \p Order - is the original order of the scalars, need to +/// reorder scalars into an unordered state at first according to the given +/// order. Then the ordered scalars are shuffled once again in accordance with +/// the provided mask. +static void reorderScalars(SmallVectorImpl &Scalars, + ArrayRef Order, ArrayRef Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + SmallVector Prev(Scalars.size()); + Prev.swap(Scalars); + if (Order.empty()) { + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + Scalars[Mask[I]] = Prev[I]; + } else { + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + Scalars[Mask[Order[I]]] = Prev[Order[I]]; + } +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -645,13 +664,12 @@ void buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst = None); - /// Construct a vectorizable tree that starts at \p Roots, ignoring users for - /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (and updating it, if required) list of externally used - /// values stored in \p ExternallyUsedValues. - void buildTree(ArrayRef Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef UserIgnoreLst = None); + /// Builds external uses of the vectorized scalars, i.e. the list of + /// vectorized scalars to be extracted, their lanes and their scalar users. \p + /// ExternallyUsedValues contains additional list of external uses to handle + /// vectorization of reductions. + void + buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { @@ -659,8 +677,6 @@ ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); - NumOpsWantToKeepOrder.clear(); - NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -674,103 +690,25 @@ /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); - /// \returns The best order of instructions for vectorization. - Optional> bestOrder() const { - assert(llvm::all_of( - NumOpsWantToKeepOrder, - [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { - return D.getFirst().size() == - VectorizableTree[0]->Scalars.size(); - }) && - "All orders must have the same size as number of instructions in " - "tree node."); - auto I = std::max_element( - NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), - [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, - const decltype(NumOpsWantToKeepOrder)::value_type &D2) { - return D1.second < D2.second; - }); - if (I == NumOpsWantToKeepOrder.end() || - I->getSecond() <= NumOpsWantToKeepOriginalOrder) - return None; - - return makeArrayRef(I->getFirst()); - } - - /// Builds the correct order for root instructions. - /// If some leaves have the same instructions to be vectorized, we may - /// incorrectly evaluate the best order for the root node (it is built for the - /// vector of instructions without repeated instructions and, thus, has less - /// elements than the root node). This function builds the correct order for - /// the root node. - /// For example, if the root node is \, then the leaves - /// are \ and \. When we try to vectorize the first - /// leaf, it will be shrink to \. If instructions in this leaf should - /// be reordered, the best order will be \<1, 0\>. We need to extend this - /// order for the root node. For the root node this order should look like - /// \<3, 0, 1, 2\>. This function extends the order for the reused - /// instructions. - void findRootOrder(OrdersType &Order) { - // If the leaf has the same number of instructions to vectorize as the root - // - order must be set already. - unsigned RootSize = VectorizableTree[0]->Scalars.size(); - if (Order.size() == RootSize) - return; - SmallVector RealOrder(Order.size()); - std::swap(Order, RealOrder); - SmallVector Mask; - inversePermutation(RealOrder, Mask); - Order.assign(Mask.begin(), Mask.end()); - // The leaf has less number of instructions - need to find the true order of - // the root. - // Scan the nodes starting from the leaf back to the root. - const TreeEntry *PNode = VectorizableTree.back().get(); - SmallVector Nodes(1, PNode); - SmallPtrSet Visited; - while (!Nodes.empty() && Order.size() != RootSize) { - const TreeEntry *PNode = Nodes.pop_back_val(); - if (!Visited.insert(PNode).second) - continue; - const TreeEntry &Node = *PNode; - for (const EdgeInfo &EI : Node.UserTreeIndices) - if (EI.UserTE) - Nodes.push_back(EI.UserTE); - if (Node.ReuseShuffleIndices.empty()) - continue; - // Build the order for the parent node. - OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); - SmallVector OrderCounter(Order.size(), 0); - // The algorithm of the order extension is: - // 1. Calculate the number of the same instructions for the order. - // 2. Calculate the index of the new order: total number of instructions - // with order less than the order of the current instruction + reuse - // number of the current instruction. - // 3. The new order is just the index of the instruction in the original - // vector of the instructions. - for (unsigned I : Node.ReuseShuffleIndices) - ++OrderCounter[Order[I]]; - SmallVector CurrentCounter(Order.size(), 0); - for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { - unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; - unsigned OrderIdx = Order[ReusedIdx]; - unsigned NewIdx = 0; - for (unsigned J = 0; J < OrderIdx; ++J) - NewIdx += OrderCounter[J]; - NewIdx += CurrentCounter[OrderIdx]; - ++CurrentCounter[OrderIdx]; - assert(NewOrder[NewIdx] == RootSize && - "The order index should not be written already."); - NewOrder[NewIdx] = I; - } - std::swap(Order, NewOrder); - } - assert(Order.size() == RootSize && - "Root node is expected or the size of the order must be the same as " - "the number of elements in the root node."); - assert(llvm::all_of(Order, - [RootSize](unsigned Val) { return Val != RootSize; }) && - "All indices must be initialized"); - } + /// Reorders the current graph to the most profitable order starting from the + /// root node to the leaf nodes. The best order is chosen only from the nodes + /// of the same size (vectorization factor). Smaller nodes are considered + /// parts of subgraph with smaller VF and they are reordered independently. We + /// can make it because we still need to extend smaller nodes to the wider VF + /// and we can merge reordering shuffles with the widening shuffles. If \p + /// FreeReorder is true, the reordering of the root node is considered free + /// and we don't need to shuffle it to restore its order. + void reorderTopToBottom(bool FreeReorder); + + /// Reorders the current graph to the most profitable order starting from + /// leaves to the root. It allows to rotate small subgraphs and reduce the + /// number of reshuffles if the leaf nodes use the same order. In this case we + /// can merge the orders and just shuffle user node instead of shuffling its + /// operands. Plus, even the leaf nodes have different orders, it allows to + /// sink reordering in the graph closer to the root node and merge it later + /// during analysis. If \p FreeReorder is true, it means that the root node of + /// the graph is free to reorder and no need to restore its original order. + void reorderBottomToTop(bool FreeReorder); /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of @@ -1701,6 +1639,12 @@ } } + /// Reorders operands of the node to the given mask \p Mask. + void reorderOperands(ArrayRef Mask) { + for (ValueList &Operand : Operands) + reorderScalars(Operand, ReorderIndices, Mask); + } + /// \returns the \p OpIdx operand of this TreeEntry. ValueList &getOperand(unsigned OpIdx) { assert(OpIdx < Operands.size() && "Off bounds"); @@ -2434,14 +2378,6 @@ } }; - /// Contains orders of operations along with the number of bundles that have - /// operations in this order. It stores only those orders that require - /// reordering, if reordering is not required it is counted using \a - /// NumOpsWantToKeepOriginalOrder. - DenseMap NumOpsWantToKeepOrder; - /// Number of bundles that do not require reordering. - unsigned NumOpsWantToKeepOriginalOrder = 0; - // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -2594,21 +2530,379 @@ }; } -void BoUpSLP::buildTree(ArrayRef Roots, - ArrayRef UserIgnoreLst) { - ExtraValueToDebugLocsMap ExternallyUsedValues; - buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); +/// Reorders the given \p Order according to the given \p Mask. \p Order - is +/// the original order of the scalars. Procedure transforms the provided order +/// in accordance with the given \p Mask. +static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask) { + assert(!Mask.empty() && "Expected non-empty mask."); + if (Order.empty()) { + Order.resize(Mask.size()); + std::iota(Order.begin(), Order.end(), 0); + } + bool IsIdentity = true; + SmallVector Prev(Order.size()); + Prev.swap(Order); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) { + Order[Mask[Prev[I]]] = I; + IsIdentity &= I == Order[I]; + } + if (IsIdentity) + Order.clear(); } -void BoUpSLP::buildTree(ArrayRef Roots, - ExtraValueToDebugLocsMap &ExternallyUsedValues, - ArrayRef UserIgnoreLst) { - deleteTree(); - UserIgnoreList = UserIgnoreLst; - if (!allSameType(Roots)) - return; - buildTree_rec(Roots, 0, EdgeInfo()); +/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses +/// contains original mask for the scalars reused in the node. Procedure +/// transform this mask in accordance with the given \p Mask. +static void reorderReuses(SmallVectorImpl &Reuses, ArrayRef Mask) { + assert(!Reuses.empty() && !Mask.empty() && + "Expected non-empty mask and reuses mask."); + SmallVector Prev(Reuses.size()); + Prev.swap(Reuses); + for (unsigned I = 0, E = Prev.size(); I < E; ++I) + Reuses[Mask[I]] = Prev[I]; +} +void BoUpSLP::reorderTopToBottom(bool FreeReorder) { + // Maps VF to the graph nodes. + DenseMap> VFToOrderedEntries; + // ExtractElement gather nodes which can be vectorized and need to handle + // their ordering. + DenseMap GathersToOrders; + // Find all reorderable nodes with the given VF. + // Currently the are vectorized loads,extracts + some gathering of extracts. + for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( + const std::unique_ptr &TE) { + if (TE->State == TreeEntry::Vectorize && + isa(TE->getMainOp())) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + } else if (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::ExtractElement && + isa(cast(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + } + } + }); + + // Reorder the graph nodes according to their vectorization factor. + for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; + VF /= 2) { + auto It = VFToOrderedEntries.find(VF); + if (It == VFToOrderedEntries.end()) + continue; + // Try to find the most profitable order. We just are looking for the most + // used order and reorder scalar elements in the nodes according to this + // mostly used order. + const SmallPtrSetImpl &OrderedEntries = It->getSecond(); + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + DenseMap OrdersUses; + SmallPtrSet VisitedOps; + for (const TreeEntry *OpTE : OrderedEntries) { + // No need to reorder this nodes, still need to extend and to use shuffle, + // just need to merge reordering shuffle and the reuse shuffle. + if (!OpTE->ReuseShuffleIndices.empty()) + continue; + // Count number of orders uses. + const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + ++OrdersUses.try_emplace(Order).first->getSecond(); + } + // Set order of the user node. + if (OrdersUses.empty()) + continue; + // If need to reorder the root node, it means it also requires to keep its + // original order. + if (VF == VectorizableTree.front()->Scalars.size() && !FreeReorder) + ++OrdersUses[VectorizableTree.front()->ReorderIndices]; + // Choose the most used order. + ArrayRef BestOrder; + unsigned Cnt; + std::tie(BestOrder, Cnt) = *OrdersUses.begin(); + for (const auto &Pair : llvm::drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) + std::tie(BestOrder, Cnt) = Pair; + } + // Set order of the user node. + if (BestOrder.empty()) + continue; + SmallVector Mask; + inversePermutation(BestOrder, Mask); + SmallPtrSet SmallOperandsToReorder; + // Do an actual reordering, if profitable. + for (std::unique_ptr &TE : VectorizableTree) { + // Just do the reordering for the nodes with the given VF. + if (TE->Scalars.size() != VF) { + if (TE->ReuseShuffleIndices.size() == VF) { + // Need to reorder the reuses masks of the operands with smaller VF to + // be able to find the math between the graph nodes and scalar + // operands of the given node during vectorization/cost estimation. + // Build a list of such operands for future reordering. + assert(all_of(TE->UserTreeIndices, + [VF](const EdgeInfo &EI) { + return EI.UserTE->Scalars.size() == VF; + }) && + "All users must be of VF size."); + SmallOperandsToReorder.insert(TE.get()); + } + continue; + } + // Reorder the node and its operands. + TE->updateStateIfReorder(); + TE->reorderOperands(Mask); + if (TE->ReuseShuffleIndices.empty()) { + reorderScalars(TE->Scalars, TE->ReorderIndices, Mask); + if (TE->State == TreeEntry::Vectorize && + (TE.get() != VectorizableTree.front().get() || !FreeReorder) && + isa( + TE->getMainOp())) { + // Build correct orders for extract{element,value}, loads and stores. + reorderOrder(TE->ReorderIndices, Mask); + // For stores the order is actually a mask. + if (isa(TE->getMainOp()) && !TE->ReorderIndices.empty()) { + SmallVector StoreOrder; + inversePermutation(TE->ReorderIndices, StoreOrder); + copy(StoreOrder, TE->ReorderIndices.begin()); + } + } else { + TE->ReorderIndices.clear(); + } + } else { + // Build correct order for nodes with reused shuffles. + reorderOrder(TE->ReorderIndices, Mask); + } + } + // Update ordering of the operands with the smaller VF than the given one. + for (TreeEntry *TE : SmallOperandsToReorder) + reorderReuses(TE->ReuseShuffleIndices, Mask); + } +} + +void BoUpSLP::reorderBottomToTop(bool FreeReorder) { + SetVector OrderedEntries; + DenseMap GathersToOrders; + // Find all reorderable nodes with the given VF. + // Currently the are vectorized loads,extracts without alternate operands + + // some gathering of extracts. + SmallVector NonVectorized; + for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, + &NonVectorized]( + const std::unique_ptr &TE) { + if (TE->State == TreeEntry::Vectorize && + isa(TE->getMainOp())) { + OrderedEntries.insert(TE.get()); + } else if (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::ExtractElement && + isa(cast(TE->getMainOp()) + ->getVectorOperandType()) && + allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector with a single user only. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); + if ((Reuse || !CurrentOrder.empty()) && + !any_of( + VectorizableTree, [&TE](const std::unique_ptr &Entry) { + return Entry->State == TreeEntry::NeedToGather && + Entry.get() != TE.get() && Entry->isSame(TE->Scalars); + })) { + OrderedEntries.insert(TE.get()); + GathersToOrders.try_emplace(TE.get(), CurrentOrder); + } + } + if (TE->State != TreeEntry::Vectorize) + NonVectorized.push_back(TE.get()); + }); + + // Checks if the operands of the users are reordarable and have only single + // use. + auto &&CheckOperands = + [this, &NonVectorized](const auto &Data, + SmallVectorImpl &GatherOps) { + for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { + if (any_of(Data.second, + [I](const std::pair &OpData) { + return OpData.first == I; + })) + continue; + ArrayRef VL = Data.first->getOperand(I); + const TreeEntry *TE = nullptr; + const auto *It = find_if(VL, [this, &TE](Value *V) { + TE = getTreeEntry(V); + return TE; + }); + if (It != VL.end() && TE->isSame(VL)) + return false; + TreeEntry *Gather = nullptr; + if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { + assert(TE->State != TreeEntry::Vectorize && + "Only non-vectorized nodes are expected."); + if (TE->isSame(VL)) { + Gather = TE; + return true; + } + return false; + }) != 1) + return false; + GatherOps.push_back(Gather); + } + return true; + }; + // 1. Propagate order to the graph nodes, which use only reordered nodes. + // I.e., if the node has operands, that are reordered, try to make at least + // one operand order in the natural order and reorder others + reorder the + // user node itself. + SmallPtrSet Visited; + while (!OrderedEntries.empty()) { + // 1. Filter out only reordered nodes. + // 2. If the entry has multiple uses - skip it and jump to the next node. + MapVector>> Users; + SmallVector Filtered; + for (TreeEntry *TE : OrderedEntries) { + if (!(TE->State == TreeEntry::Vectorize || + (TE->State == TreeEntry::NeedToGather && + TE->getOpcode() == Instruction::ExtractElement)) || + TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || + !all_of(drop_begin(TE->UserTreeIndices), + [TE](const EdgeInfo &EI) { + return EI.UserTE == TE->UserTreeIndices.front().UserTE; + }) || + !Visited.insert(TE).second) { + Filtered.push_back(TE); + continue; + } + // Build a map between user nodes and their operands order to speedup + // search. The graph currently does not provide this dependency directly. + for (EdgeInfo &EI : TE->UserTreeIndices) { + TreeEntry *UserTE = EI.UserTE; + auto It = Users.find(UserTE); + if (It == Users.end()) + It = Users.insert({UserTE, {}}).first; + It->second.emplace_back(EI.EdgeIdx, TE); + } + } + // Erase filtered entries. + for_each(Filtered, + [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); + for (const auto &Data : Users) { + // Check that operands are used only in the User node. + SmallVector GatherOps; + if (!CheckOperands(Data, GatherOps)) { + for_each(Data.second, + [&OrderedEntries](const std::pair &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // All operands are reordered and used only in this node - propagate the + // most used order to the user node. + DenseMap OrdersUses; + SmallPtrSet VisitedOps; + for (const auto &Op : Data.second) { + TreeEntry *OpTE = Op.second; + if (!OpTE->ReuseShuffleIndices.empty()) + continue; + const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { + if (OpTE->State == TreeEntry::NeedToGather) + return GathersToOrders.find(OpTE)->second; + return OpTE->ReorderIndices; + }(); + ++OrdersUses.try_emplace(Order).first->getSecond(); + if (VisitedOps.insert(OpTE).second) + OrdersUses.try_emplace({}, 0).first->getSecond() += + OpTE->UserTreeIndices.size(); + --OrdersUses[{}]; + } + // If no orders - skip current nodes and jump to the next one, if any. + if (OrdersUses.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Choose the best order. + ArrayRef BestOrder; + unsigned Cnt; + std::tie(BestOrder, Cnt) = *OrdersUses.begin(); + for (const auto &Pair : llvm::drop_begin(OrdersUses)) { + if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) + std::tie(BestOrder, Cnt) = Pair; + } + // Set order of the user node (reordering of operands and user nodes). + if (BestOrder.empty()) { + for_each(Data.second, + [&OrderedEntries](const std::pair &Op) { + OrderedEntries.remove(Op.second); + }); + continue; + } + // Erase operands from OrderedEntries list and adjust their orders. + VisitedOps.clear(); + SmallVector Mask; + inversePermutation(BestOrder, Mask); + for (const std::pair &Op : Data.second) { + TreeEntry *TE = Op.second; + OrderedEntries.remove(TE); + if (!VisitedOps.insert(TE).second) + continue; + if (!TE->ReuseShuffleIndices.empty()) { + // Just reorder reuses indices. + reorderReuses(TE->ReuseShuffleIndices, Mask); + continue; + } + // Gathers are processed separately. + if (TE->State != TreeEntry::Vectorize) + continue; + assert((BestOrder.size() == TE->ReorderIndices.size() || + TE->ReorderIndices.empty()) && + "Non-matching sizes of user/operand entries."); + TE->updateStateIfReorder(); + reorderScalars(TE->Scalars, TE->ReorderIndices, Mask); + reorderOrder(TE->ReorderIndices, Mask); + } + // For gathers just need to reorder its scalars. + for (TreeEntry *Gather : GatherOps) { + if (!Gather->ReuseShuffleIndices.empty()) + continue; + reorderScalars(Gather->Scalars, None, Mask); + OrderedEntries.remove(Gather); + } + // Reorder operands of the user node and set the ordering for the user + // node itself. + Data.first->updateStateIfReorder(); + Data.first->reorderOperands(Mask); + if (!FreeReorder || Data.first != VectorizableTree.front().get()) { + reorderOrder(Data.first->ReorderIndices, Mask); + // For stores the order is actually a mask. + if (isa(Data.first->getMainOp()) && + !Data.first->ReorderIndices.empty()) { + SmallVector StoreOrder; + inversePermutation(Data.first->ReorderIndices, StoreOrder); + copy(StoreOrder, Data.first->ReorderIndices.begin()); + } + // Insert user node to the list to try to sink reordering deeper in the + // graph. + OrderedEntries.insert(Data.first); + } else { + reorderScalars(Data.first->Scalars, Data.first->ReorderIndices, Mask); + } + } + } +} + +void BoUpSLP::buildExternalUses( + const ExtraValueToDebugLocsMap &ExternallyUsedValues) { // Collect the values that we need to extract from the tree. for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -2664,6 +2958,73 @@ } } +void BoUpSLP::buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst) { + deleteTree(); + UserIgnoreList = UserIgnoreLst; + if (!allSameType(Roots)) + return; + buildTree_rec(Roots, 0, EdgeInfo()); +} + +namespace { +/// Tracks the state we can represent the loads in the given sequence. +enum class LoadsState { Gather, Vectorize, ScatterVectorize }; +} // anonymous namespace + +/// Checks if the given array of loads can be represented as a vectorized, +/// scatter or just simple gather. +static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, + const DataLayout &DL, ScalarEvolution &SE, + SmallVectorImpl &Order, + SmallVectorImpl &PointerOps) { + // Check that a vectorized load would load the same memory as a scalar + // load. For example, we don't want to vectorize loads that are smaller + // than 8-bit. Even though we have a packed struct {} LLVM + // treats loading/storing it as an i8 struct. If we vectorize loads/stores + // from such a struct, we read/write packed bits disagreeing with the + // unvectorized version. + Type *ScalarTy = VL0->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) + return LoadsState::Gather; + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. + PointerOps.clear(); + PointerOps.resize(VL.size()); + auto *POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast(V); + if (!L->isSimple()) + return LoadsState::Gather; + *POIter = L->getPointerOperand(); + ++POIter; + } + + Order.clear(); + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + Optional Diff = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); + // Check that the sorted loads are consecutive. + if (static_cast(*Diff) == VL.size() - 1) + return LoadsState::Vectorize; + return LoadsState::ScatterVectorize; + } + + return LoadsState::Gather; +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -2867,7 +3228,6 @@ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); - ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time @@ -2889,8 +3249,6 @@ // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2946,82 +3304,50 @@ // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); - return; - } - - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - SmallVector PointerOps(VL.size()); - auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); - if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); - return; - } - *POIter = L->getPointerOperand(); - ++POIter; - } - + SmallVector PointerOps; OrdersType CurrentOrder; - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + TreeEntry *TE = nullptr; + switch (canVectorizeLoads(VL, VL0, *DL, *SE, CurrentOrder, PointerOps)) { + case LoadsState::Vectorize: if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original loads are consecutive and does not require reordering. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - Optional Diff = getPointersDiff( - ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); - // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == VL.size() - 1) { - if (CurrentOrder.empty()) { - // Original loads are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); - } else { - // Need to reorder. - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; - } - return; + // Need to reorder. + TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } + TE->setOperandsInOrder(); + break; + case LoadsState::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); + TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); - return; + break; + case LoadsState::Gather: + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); +#ifndef NDEBUG + Type *ScalarTy = VL0->getType(); + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + else if (any_of(VL, [](Value *V) { + return !cast(V)->isSimple(); + })) + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + else + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); +#endif // NDEBUG + break; } - - LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -3268,7 +3594,6 @@ if (static_cast(*Dist) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); @@ -3281,8 +3606,6 @@ TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3613,6 +3936,27 @@ return Cost; } +/// Shuffles \p Mask in accordance with the given \p SubMask. +static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask) { + if (SubMask.empty()) + return; + if (Mask.empty()) { + Mask.append(SubMask.begin(), SubMask.end()); + return; + } + SmallVector NewMask(SubMask.size(), SubMask.size()); + int TermValue = std::min(Mask.size(), SubMask.size()); + for (int I = 0, E = SubMask.size(); I < E; ++I) { + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || + Mask[SubMask[I]] >= TermValue) { + NewMask[I] = UndefMaskElem; + continue; + } + NewMask[I] = Mask[SubMask[I]]; + } + Mask.swap(NewMask); +} + InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals) { ArrayRef VL = E->Scalars; @@ -3635,12 +3979,6 @@ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) { - ReuseShuffleCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy, - E->ReuseShuffleIndices); - } // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, @@ -3729,6 +4067,12 @@ dbgs() << "SLP: perfect diamond match for gather bundle that starts with " << *VL.front() << ".\n"); + // FIXME: need to use FixedVectorType::get(VecTy->getElementType(), + // E->ReuseShuffleIndices.size()) instead of VecTy here. + if (NeedToShuffleReuses) + GatherCost = + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTy, E->ReuseShuffleIndices); } else { LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle that starts with " @@ -3736,16 +4080,17 @@ // Detected that instead of gather we can emit a shuffle of single/two // previously vectorized nodes. Add the cost of the permutation rather // than gather. + ::addMask(Mask, E->ReuseShuffleIndices); + // FIXME: need to use FixedVectorType::get(VecTy->getElementType(), + // E->ReuseShuffleIndices.size()) instead of VecTy here. GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask); } - return ReuseShuffleCost + GatherCost; + return GatherCost; } if (isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as the // broadcast. - return ReuseShuffleCost + - TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None, - 0); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); } if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL) && @@ -3763,11 +4108,122 @@ InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); AdjustExtractsCost(Cost, /*IsGather=*/true); - return ReuseShuffleCost + Cost; + if (NeedToShuffleReuses) + // FIXME: need to use FixedVectorType::get(VecTy->getElementType(), + // E->ReuseShuffleIndices.size()) instead of VecTy here. + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTy, E->ReuseShuffleIndices); + return Cost; + } + } + InstructionCost ReuseShuffleCost = 0; + if (NeedToShuffleReuses) + // FIXME: need to use FixedVectorType::get(VecTy->getElementType(), + // E->ReuseShuffleIndices.size()) instead of VecTy here. + ReuseShuffleCost = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, + E->ReuseShuffleIndices); + // Improve gather cost for gather of loads, if we can group some of the + // loads into vector loads. + if (VL.size() > 2 && E->getOpcode() == Instruction::Load && + !E->isAltShuffle()) { + BoUpSLP::ValueSet VectorizedLoads; + unsigned StartIdx = 0; + unsigned VF = VL.size() / 2; + unsigned VectorizedCnt = 0; + unsigned ScatterVectorizeCnt = 0; + const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); + for (unsigned MinVF = getMinVecRegSize() / (2 * Sz); VF >= MinVF; + VF /= 2) { + for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; + Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, VF); + if (!VectorizedLoads.count(Slice.front()) && + !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *DL, *SE, + CurrentOrder, PointerOps); + switch (LS) { + case LoadsState::Vectorize: + case LoadsState::ScatterVectorize: + // Mark the vectorized loads so that we don't vectorize them + // again. + if (LS == LoadsState::Vectorize) + ++VectorizedCnt; + else + ++ScatterVectorizeCnt; + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += VF; + break; + case LoadsState::Gather: + break; + } + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= VL.size()) + break; + // Found vectorizable parts - exit. + if (!VectorizedLoads.empty()) + break; + } + if (!VectorizedLoads.empty()) { + InstructionCost GatherCost = 0; + // Get the cost for gathered loads. + for (unsigned I = 0, End = VL.size(); I < End; I += VF) { + if (VectorizedLoads.contains(VL[I])) + continue; + GatherCost += getGatherCost(VL.slice(I, VF)); + } + // The cost for vectorized loads. + InstructionCost ScalarsCost = 0; + for (Value *V : VectorizedLoads) { + auto *LI = cast(V); + ScalarsCost += TTI->getMemoryOpCost( + Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace(), CostKind, LI); + } + auto *LI = cast(E->getMainOp()); + auto *LoadTy = FixedVectorType::get(LI->getType(), VF); + Align Alignment = LI->getAlign(); + GatherCost += + VectorizedCnt * + TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, + LI->getPointerAddressSpace(), CostKind, LI); + GatherCost += ScatterVectorizeCnt * + TTI->getGatherScatterOpCost( + Instruction::Load, LoadTy, LI->getPointerOperand(), + /*VariableMask=*/false, Alignment, CostKind, LI); + // Add the cost for the subvectors shuffling. + for (unsigned I = VF, End = VL.size(); I < End; I += VF) + GatherCost += TTI->getShuffleCost(TTI::SK_Select, VecTy); + return ReuseShuffleCost + GatherCost - ScalarsCost; } } return ReuseShuffleCost + getGatherCost(VL); } + InstructionCost CommonCost = 0; + SmallVector Mask; + if (!E->ReorderIndices.empty()) { + SmallVector NewMask; + if (E->getOpcode() == Instruction::Store) { + // For stores the order is actually a mask. + NewMask.resize(E->ReorderIndices.size()); + copy(E->ReorderIndices, NewMask.begin()); + } else { + inversePermutation(E->ReorderIndices, NewMask); + } + ::addMask(Mask, NewMask); + } + if (NeedToShuffleReuses) + ::addMask(Mask, E->ReuseShuffleIndices); + if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) + // FIXME: need to use FixedVectorType::get(VecTy->getElementType(), + // E->ReuseShuffleIndices.size()) instead of VecTy here. + CommonCost = TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); @@ -3783,18 +4239,17 @@ case Instruction::ExtractElement: { // The common cost of removal ExtractElement/ExtractValue instructions + // the cost of shuffles, if required to resuffle the original vector. - InstructionCost CommonCost = 0; if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast(VL[I]); - ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, EE->getVectorOperandType(), - *getExtractIndex(EE)); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); } else { - ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + VecTy, Idx); ++Idx; } } @@ -3802,21 +4257,15 @@ for (Value *V : VL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *EE = cast(V); - ReuseShuffleCost += TTI->getVectorInstrCost( - Instruction::ExtractElement, EE->getVectorOperandType(), - *getExtractIndex(EE)); + CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); } else { --Idx; - ReuseShuffleCost += TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, + VecTy, Idx); } } - CommonCost = ReuseShuffleCost; - } else if (!E->ReorderIndices.empty()) { - SmallVector NewMask; - inversePermutation(E->ReorderIndices, NewMask); - CommonCost = TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask); } if (ShuffleOrOp == Instruction::ExtractValue) { for (unsigned I = 0, E = VL.size(); I < E; ++I) { @@ -3847,6 +4296,10 @@ return CommonCost; } case Instruction::InsertElement: { + assert(E->ReuseShuffleIndices.empty() && + "Unique insertelements only are expected."); + assert(E->ReorderIndices.empty() && + "No reordering expected for insertelements."); auto *SrcVecTy = cast(VL0->getType()); unsigned const NumElts = SrcVecTy->getNumElements(); @@ -3905,7 +4358,7 @@ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. @@ -3915,12 +4368,11 @@ InstructionCost VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = - ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); + VecCost = CommonCost + TTI->getCastInstrCost( + E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return VecCost - ScalarCost; } case Instruction::FCmp: @@ -3931,7 +4383,7 @@ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; @@ -3972,8 +4424,8 @@ CmpInst::BAD_ICMP_PREDICATE, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } case Instruction::FNeg: case Instruction::Add: @@ -4036,14 +4488,14 @@ TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = @@ -4053,14 +4505,11 @@ InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost( Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. @@ -4068,7 +4517,7 @@ InstructionCost ScalarEltCost = TTI->getMemoryOpCost( Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecLdCost; @@ -4081,14 +4530,8 @@ Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), /*VariableMask=*/false, alignment, CostKind, VL0); } - if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) { - SmallVector NewMask; - inversePermutation(E->ReorderIndices, NewMask); - VecLdCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask); - } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); - return ReuseShuffleCost + VecLdCost - ScalarLdCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); + return CommonCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. @@ -4101,14 +4544,8 @@ InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecStCost = TTI->getMemoryOpCost( Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); - if (IsReorder) { - SmallVector NewMask; - inversePermutation(E->ReorderIndices, NewMask); - VecStCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask); - } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); - return VecStCost - ScalarStCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost)); + return CommonCost + VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast(VL0); @@ -4119,7 +4556,7 @@ InstructionCost ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -4131,7 +4568,7 @@ << " (" << VecCallCost << "-" << ScalarCallCost << ")" << " for " << *CI << "\n"); - return ReuseShuffleCost + VecCallCost - ScalarCallCost; + return CommonCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { assert(E->isAltShuffle() && @@ -4144,11 +4581,11 @@ if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast(VL[Idx]); - ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind); + CommonCost -= TTI->getInstructionCost(I, CostKind); } for (Value *V : VL) { Instruction *I = cast(V); - ReuseShuffleCost += TTI->getInstructionCost(I, CostKind); + CommonCost += TTI->getInstructionCost(I, CostKind); } } for (Value *V : VL) { @@ -4180,10 +4617,19 @@ assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0); } - VecCost += - TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + if (!E->ReorderIndices.empty()) { + SmallVector NewMask; + inversePermutation(E->ReorderIndices, NewMask); + ::addMask(Mask, NewMask); + } + if (NeedToShuffleReuses) + ::addMask(Mask, E->ReuseShuffleIndices); + // FIXME: need to use FixedVectorType::get(VecTy->getElementType(), + // E->ReuseShuffleIndices.size()) instead of VecTy here. + CommonCost = + TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask); + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); @@ -4915,25 +5361,7 @@ addMask(NewMask); } - void addMask(ArrayRef SubMask) { - if (SubMask.empty()) - return; - if (Mask.empty()) { - Mask.append(SubMask.begin(), SubMask.end()); - return; - } - SmallVector NewMask(SubMask.size(), SubMask.size()); - int TermValue = std::min(Mask.size(), SubMask.size()); - for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) { - NewMask[I] = UndefMaskElem; - continue; - } - NewMask[I] = Mask[SubMask[I]]; - } - Mask.swap(NewMask); - } + void addMask(ArrayRef SubMask) { ::addMask(Mask, SubMask); } Value *finalize(Value *V) { IsFinalized = true; @@ -5124,13 +5552,17 @@ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { + assert( + (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && + "PHI reordering is free."); auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -5181,6 +5613,7 @@ return NewV; } case Instruction::InsertElement: { + assert(E->ReorderIndices.empty() && "InsertElements reordering is free."); Builder.SetInsertPoint(VL0); Value *V = vectorizeTree(E->getOperand(1)); @@ -5267,6 +5700,7 @@ auto *CI = cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5289,6 +5723,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5309,6 +5744,7 @@ } Value *V = Builder.CreateSelect(Cond, True, False); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5332,6 +5768,7 @@ if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5375,6 +5812,7 @@ if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5487,6 +5925,7 @@ if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5553,6 +5992,7 @@ ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5600,19 +6040,28 @@ // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. ValueList OpScalars, AltScalars; - unsigned e = E->Scalars.size(); - SmallVector Mask(e); - for (unsigned i = 0; i < e; ++i) { - auto *OpInst = cast(E->Scalars[i]); + unsigned Sz = E->Scalars.size(); + SmallVector Mask(Sz); + for (unsigned I = 0; I < Sz; ++I) { + unsigned Idx = I; + if (!E->ReorderIndices.empty()) + Idx = E->ReorderIndices[I]; + auto *OpInst = cast(E->Scalars[Idx]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[i] = e + i; - AltScalars.push_back(E->Scalars[i]); + Mask[Idx] = Sz + I; + AltScalars.push_back(OpInst); } else { - Mask[i] = i; - OpScalars.push_back(E->Scalars[i]); + Mask[Idx] = I; + OpScalars.push_back(OpInst); } } + if (!E->ReuseShuffleIndices.empty()) { + SmallVector NewMask(E->ReuseShuffleIndices.size()); + transform(E->ReuseShuffleIndices, NewMask.begin(), + [&Mask](int Idx) { return Mask[Idx]; }); + Mask.swap(NewMask); + } propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); @@ -5620,7 +6069,6 @@ Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -6790,44 +7238,6 @@ return Changed; } -/// Order may have elements assigned special value (size) which is out of -/// bounds. Such indices only appear on places which correspond to undef values -/// (see canReuseExtract for details) and used in order to avoid undef values -/// have effect on operands ordering. -/// The first loop below simply finds all unused indices and then the next loop -/// nest assigns these indices for undef values positions. -/// As an example below Order has two undef positions and they have assigned -/// values 3 and 7 respectively: -/// before: 6 9 5 4 9 2 1 0 -/// after: 6 3 5 4 7 2 1 0 -/// \returns Fixed ordering. -static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef Order) { - BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); - const unsigned Sz = NewOrder.size(); - SmallBitVector UsedIndices(Sz); - SmallVector MaskedIndices; - for (int I = 0, E = NewOrder.size(); I < E; ++I) { - if (NewOrder[I] < Sz) - UsedIndices.set(NewOrder[I]); - else - MaskedIndices.push_back(I); - } - if (MaskedIndices.empty()) - return NewOrder; - SmallVector AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - NewOrder[MaskedIndices[I]] = AvailableIndices[I]; - return NewOrder; -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -6843,19 +7253,13 @@ << "\n"); R.buildTree(Chain); - Optional> Order = R.bestOrder(); - // TODO: Handle orders of size less than number of elements in the vector. - if (Order && Order->size() == Chain.size()) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector ReorderedOps(Chain.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Chain](const unsigned Idx) { return Chain[Idx]; }); - R.buildTree(ReorderedOps); - } if (R.isTreeTinyAndNotFullyVectorizable()) return false; if (R.isLoadCombineCandidate()) return false; + R.reorderTopToBottom(/*FreeReorder=*/false); + R.reorderBottomToTop(/*FreeReorder=*/false); + R.buildExternalUses(); R.computeMinimumValueSizes(); @@ -7142,18 +7546,11 @@ << "\n"); R.buildTree(Ops); - if (AllowReorder) { - Optional> Order = R.bestOrder(); - if (Order) { - // TODO: reorder tree nodes without tree rebuilding. - SmallVector ReorderedOps(Ops.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [Ops](const unsigned Idx) { return Ops[Idx]; }); - R.buildTree(ReorderedOps); - } - } if (R.isTreeTinyAndNotFullyVectorizable()) continue; + R.reorderTopToBottom(AllowReorder); + R.reorderBottomToTop(AllowReorder); + R.buildExternalUses(); R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); @@ -7765,22 +8162,14 @@ unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { ArrayRef VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, IgnoreList); - Optional> Order = V.bestOrder(); - if (Order) { - assert(Order->size() == VL.size() && - "Order size must be the same as number of vectorized " - "instructions."); - // TODO: reorder tree nodes without tree rebuilding. - SmallVector ReorderedOps(VL.size()); - transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), - [VL](const unsigned Idx) { return VL[Idx]; }); - V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); - } + V.buildTree(VL, IgnoreList); if (V.isTreeTinyAndNotFullyVectorizable()) break; if (V.isLoadCombineReductionCandidate(RdxKind)) break; + V.reorderTopToBottom(/*FreeReorder=*/true); + V.reorderBottomToTop(/*FreeReorder=*/true); + V.buildExternalUses(ExternallyUsedValues); V.computeMinimumValueSizes(); @@ -8239,7 +8628,7 @@ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false); + return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/true); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -6,16 +6,14 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <2 x i64> [[TMP7]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -74,16 +72,14 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -114,16 +110,14 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -229,22 +223,20 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) -; CHECK-NEXT: ret i32 [[TMP15]] +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -6,16 +6,14 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <2 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <2 x i64> [[TMP7]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -74,16 +72,14 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: ret <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -114,16 +110,14 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -229,22 +223,20 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) -; CHECK-NEXT: ret i32 [[TMP15]] +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -338,32 +338,30 @@ ret void } -; Dont vectorization of following code for float data type as sub is not commutative- -; fc[0] = fb[0]+fa[0]; -; fc[1] = fa[1]-fb[1]; -; fc[2] = fa[2]+fb[2]; -; fc[3] = fb[3]-fa[3]; -; In the above code we can swap the 1st and 2nd operation as fadd is commutative -; but not 2nd or 4th as fsub is not commutative. - -define void @no_vec_shuff_reorder() #0 { -; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +define void @vec_shuff_reorder() #0 { +; CHECK-LABEL: @vec_shuff_reorder( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -61,12 +61,12 @@ ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] ; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], ; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> ; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -30,11 +30,11 @@ ; CHECK-LABEL: @fextr1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <2 x double>, <2 x double>* undef, align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SHUFFLE]], +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[LD]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -8,12 +8,12 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -11,21 +11,21 @@ ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[GEP_5]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 @@ -69,22 +69,22 @@ ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -26,32 +26,31 @@ ; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -15,8 +15,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], +; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,15 +7,15 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A7:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -57,15 +57,15 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A6:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -111,15 +111,15 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A4:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]]