diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1904,6 +1904,41 @@
   ~BoUpSLP();
 
 private:
+  /// Check if the operands on the edges \p Edges of the \p UserTE allow
+  /// reordering (i.e. the operands can be reordered because they have only one
+  /// user and are reorderable).
+  /// \param ReorderableGathers List of all gather nodes that require reordering
+  /// (e.g., gather of extractelements or partially vectorizable loads).
+  /// \param GatherOps List of gather operand nodes for \p UserTE that require
+  /// reordering, subset of \p ReorderableGathers.
+  bool
+  canReorderOperands(TreeEntry *UserTE,
+                     SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+                     ArrayRef<TreeEntry *> ReorderableGathers,
+                     SmallVectorImpl<TreeEntry *> &GatherOps);
+
+  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+  /// if any. If it is not vectorized (gather node), returns nullptr.
+  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
+    ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
+    TreeEntry *TE = nullptr;
+    const auto *It = find_if(VL, [this, &TE](Value *V) {
+      TE = getTreeEntry(V);
+      return TE;
+    });
+    if (It != VL.end() && TE->isSame(VL))
+      return TE;
+    return nullptr;
+  }
+
+  /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
+  /// if any. If it is not vectorized (gather node), returns nullptr.
+  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
+                                        unsigned OpIdx) const {
+    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
+        const_cast<TreeEntry *>(UserTE), OpIdx);
+  }
+
   /// Checks if all users of \p I are the part of the vectorization tree.
   bool areAllUsersVectorized(Instruction *I,
                              ArrayRef<Value *> VectorizedVals) const;
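
(Note on the hunk above: the const overload of getVectorizedOperand follows the
standard const/non-const delegation idiom, forwarding to the non-const
implementation via const_cast. A minimal standalone sketch of the same idiom;
the Registry and Node names are illustrative and not part of the patch:

  #include <map>

  // Illustrative example of the idiom, not LLVM code.
  struct Node { int Id = 0; };

  struct Registry {
    std::map<int, Node> Nodes;

    // The non-const overload does the real lookup work.
    Node *lookup(int Id) {
      auto It = Nodes.find(Id);
      return It == Nodes.end() ? nullptr : &It->second;
    }

    // The const overload forwards to the non-const one; this is safe because
    // the non-const implementation never mutates *this.
    const Node *lookup(int Id) const {
      return const_cast<Registry *>(this)->lookup(Id);
    }
  };

This keeps a const-correct interface for const callers without duplicating the
search logic.)
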
@@ -3383,6 +3418,44 @@
   }
 }
 
+bool BoUpSLP::canReorderOperands(
+    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
+    ArrayRef<TreeEntry *> ReorderableGathers,
+    SmallVectorImpl<TreeEntry *> &GatherOps) {
+  for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
+    if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
+          return OpData.first == I &&
+                 OpData.second->State == TreeEntry::Vectorize;
+        }))
+      continue;
+    if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
+      // Do not reorder if operand node is used by many user nodes.
+      if (any_of(TE->UserTreeIndices,
+                 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
+        return false;
+      // Add the node to the list of the ordered nodes with the identity
+      // order.
+      Edges.emplace_back(I, TE);
+      continue;
+    }
+    ArrayRef<Value *> VL = UserTE->getOperand(I);
+    TreeEntry *Gather = nullptr;
+    if (count_if(ReorderableGathers, [VL, &Gather](TreeEntry *TE) {
+          assert(TE->State != TreeEntry::Vectorize &&
+                 "Only non-vectorized nodes are expected.");
+          if (TE->isSame(VL)) {
+            Gather = TE;
+            return true;
+          }
+          return false;
+        }) > 1)
+      return false;
+    if (Gather)
+      GatherOps.push_back(Gather);
+  }
+  return true;
+}
+
 void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   SetVector<TreeEntry *> OrderedEntries;
   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3403,42 +3476,6 @@
     }
   });
 
-  // Checks if the operands of the users are reordarable and have only single
-  // use.
-  auto &&CheckOperands =
-      [this, &NonVectorized](const auto &Data,
-                             SmallVectorImpl<TreeEntry *> &GatherOps) {
-        for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
-          if (any_of(Data.second,
-                     [I](const std::pair<unsigned, TreeEntry *> &OpData) {
-                       return OpData.first == I &&
-                              OpData.second->State == TreeEntry::Vectorize;
-                     }))
-            continue;
-          ArrayRef<Value *> VL = Data.first->getOperand(I);
-          const TreeEntry *TE = nullptr;
-          const auto *It = find_if(VL, [this, &TE](Value *V) {
-            TE = getTreeEntry(V);
-            return TE;
-          });
-          if (It != VL.end() && TE->isSame(VL))
-            return false;
-          TreeEntry *Gather = nullptr;
-          if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
-                assert(TE->State != TreeEntry::Vectorize &&
-                       "Only non-vectorized nodes are expected.");
-                if (TE->isSame(VL)) {
-                  Gather = TE;
-                  return true;
-                }
-                return false;
-              }) > 1)
-            return false;
-          if (Gather)
-            GatherOps.push_back(Gather);
-        }
-        return true;
-      };
   // 1. Propagate order to the graph nodes, which use only reordered nodes.
   // I.e., if the node has operands, that are reordered, try to make at least
   // one operand order in the natural order and reorder others + reorder the
@@ -3475,10 +3512,11 @@
     // Erase filtered entries.
     for_each(Filtered,
             [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
-    for (const auto &Data : Users) {
+    for (auto &Data : Users) {
       // Check that operands are used only in the User node.
       SmallVector<TreeEntry *> GatherOps;
-      if (!CheckOperands(Data, GatherOps)) {
+      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
+                              GatherOps)) {
         for_each(Data.second,
                  [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
                    OrderedEntries.remove(Op.second);
@@ -3494,6 +3532,7 @@
      // the same node my be considered several times, though might be not
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+     SmallPtrSet<TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
@@ -3506,6 +3545,10 @@
          return GathersToOrders.find(OpTE)->second;
        return OpTE->ReorderIndices;
      }();
+     unsigned NumOps = count_if(
+         Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
+           return P.second == OpTE;
+         });
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3517,14 +3560,52 @@
          return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(CurrentOrder);
-       ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+       OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
+           NumOps;
      } else {
-       ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+       OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+     }
+     auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+     const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
+                                         const TreeEntry *TE) {
+       if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+           (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
+           (IgnoreReorder && TE->Idx == 0))
+         return true;
+       if (TE->State == TreeEntry::NeedToGather) {
+         auto It = GathersToOrders.find(TE);
+         if (It != GathersToOrders.end())
+           return !It->second.empty();
+         return true;
+       }
+       return false;
+     };
+     for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
+       TreeEntry *UserTE = EI.UserTE;
+       if (!VisitedUsers.insert(UserTE).second)
+         continue;
+       // May reorder the user node if it requires reordering, has reused
+       // scalars, is an alternate op vectorize node, or its op nodes require
+       // reordering.
+       if (AllowsReordering(UserTE))
+         continue;
+       // Check if the users allow reordering.
+       // Currently look up just 1 level of operands to avoid an increase in
+       // compile time.
+       // It is profitable to reorder if definitely more operands allow
+       // reordering than operands with the natural order.
+       ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
+       if (static_cast<unsigned>(count_if(
+               Ops, [UserTE, &AllowsReordering](
+                        const std::pair<unsigned, TreeEntry *> &Op) {
+                 return AllowsReordering(Op.second) &&
+                        all_of(Op.second->UserTreeIndices,
+                               [UserTE](const EdgeInfo &EI) {
+                                 return EI.UserTE == UserTE;
+                               });
+               })) <= Ops.size() / 2)
+         ++Res.first->second;
      }
-     OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
-         OpTE->UserTreeIndices.size();
-     assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
-     --OrdersUses[{}];
    }
    // If no orders - skip current nodes and jump to the next one, if any.
    if (OrdersUses.empty()) {
@@ -3565,7 +3646,7 @@
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
-       if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+       if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          // Just reorder reuses indices.
          reorderReuses(TE->ReuseShuffleIndices, Mask);
          continue;
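
(The rewritten counting above changes the vote bookkeeping in two ways: each
operand node now contributes NumOps votes for its order, one per edge of the
user that references that node, and the identity order gains one vote per user
that itself cannot be reordered and for which at most half of its operand nodes
could be freely reordered. A simplified, hypothetical model of how such a vote
map resolves to a final order; pickBestOrder and the std::map container are
illustrative stand-ins, not the patch's actual data structures:

  #include <map>
  #include <vector>

  using OrdersType = std::vector<unsigned>; // empty vector = identity order

  // Hypothetical helper; the patch implements this selection inline.
  // Choose the most frequently requested order. Ties favor the identity
  // order, since applying a permutation is not free.
  OrdersType pickBestOrder(const std::map<OrdersType, unsigned> &OrdersUses) {
    OrdersType Best; // identity by default
    unsigned BestCount = 0;
    auto IdentityIt = OrdersUses.find(OrdersType());
    if (IdentityIt != OrdersUses.end())
      BestCount = IdentityIt->second; // identity wins ties
    for (const auto &OrderAndCount : OrdersUses)
      if (OrderAndCount.second > BestCount) {
        Best = OrderAndCount.first;
        BestCount = OrderAndCount.second;
      }
    return Best;
  }

)
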
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -18,19 +18,16 @@
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub <4 x i32> , [[TMP20]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]],
-; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]],
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[TMP18]]
+; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]],
+; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]],
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32>
 ; CHECK-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i32> [[TMP29]], <4 x i32> [[TMP30]], <4 x i32>
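
(The test above shows the payoff: with the order propagated bottom-to-top in
one step and applied once at the vectorized loads, the three compensating
shufflevectors (SHUFFLE, SHUFFLE1, SHUFFLE2) disappear, and the loaded vectors
feed the sub/add chain directly.)
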
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
@@ -7,10 +7,9 @@
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]]
 ; CHECK:       block1:
@@ -18,10 +17,10 @@
 ; CHECK:       block2:
 ; CHECK-NEXT:    br label [[BLOCK3]]
 ; CHECK:       block3:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[SHUFFLE]], [[BLOCK1]] ], [ [[SHUFFLE]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
 entry:
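
(In reordered-top-scalars.ll the reorder is instead folded into the scalars:
the shufflevector feeding the phi disappears, the two extractelement indices
swap, and the operands of the commutative mul swap accordingly, which is
semantically identical but one shuffle cheaper.)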