diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3402,7 +3402,7 @@
   // Checks if the operands of the users are reordarable and have only single
   // use.
   auto &&CheckOperands =
-      [this, &NonVectorized](const auto &Data,
+      [this, &NonVectorized](auto &Data,
                              SmallVectorImpl<TreeEntry *> &GatherOps) {
         for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) {
           if (any_of(Data.second,
@@ -3412,13 +3412,22 @@
                      }))
             continue;
           ArrayRef<Value *> VL = Data.first->getOperand(I);
-          const TreeEntry *TE = nullptr;
+          TreeEntry *TE = nullptr;
           const auto *It = find_if(VL, [this, &TE](Value *V) {
             TE = getTreeEntry(V);
             return TE;
           });
-          if (It != VL.end() && TE->isSame(VL))
-            return false;
+          if (It != VL.end() && TE->isSame(VL)) {
+            // Do not reorder if operand node is used by many user nodes.
+            if (any_of(TE->UserTreeIndices, [&Data](const EdgeInfo &EI) {
+                  return EI.UserTE != Data.first;
+                }))
+              return false;
+            // Add the node to the list of the ordered nodes with the identity
+            // order.
+            Data.second.emplace_back(I, TE);
+            continue;
+          }
           TreeEntry *Gather = nullptr;
           if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) {
                 assert(TE->State != TreeEntry::Vectorize &&
@@ -3471,7 +3480,7 @@
     // Erase filtered entries.
     for_each(Filtered,
              [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });
-    for (const auto &Data : Users) {
+    for (auto &Data : Users) {
       // Check that operands are used only in the User node.
       SmallVector<TreeEntry *> GatherOps;
       if (!CheckOperands(Data, GatherOps)) {
@@ -3490,6 +3499,7 @@
       // the same node my be considered several times, though might be not
       // profitable.
       SmallPtrSet<const TreeEntry *, 4> VisitedOps;
+      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
       for (const auto &Op : Data.second) {
         TreeEntry *OpTE = Op.second;
         if (!VisitedOps.insert(OpTE).second)
@@ -3502,6 +3512,10 @@
             return GathersToOrders.find(OpTE)->second;
           return OpTE->ReorderIndices;
         }();
+        unsigned NumOps = count_if(
+            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
+              return P.second == OpTE;
+            });
         // Stores actually store the mask, not the order, need to invert.
         if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
             OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -3513,14 +3527,52 @@
             return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
           });
           fixupOrderingIndices(CurrentOrder);
-          ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
+          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
+              NumOps;
         } else {
-          ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
+        }
+        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
+        const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
+                                            const TreeEntry *TE) {
+          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
+              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
+              (IgnoreReorder && TE->Idx == 0))
+            return true;
+          if (TE->State == TreeEntry::NeedToGather) {
+            auto It = GathersToOrders.find(TE);
+            if (It != GathersToOrders.end())
+              return !It->second.empty();
+            return true;
+          }
+          return false;
+        };
+        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
+          TreeEntry *UserTE = EI.UserTE;
+          if (!VisitedUsers.insert(UserTE).second)
+            continue;
+          // May reorder user node if it requires reordering, has reused
+          // scalars, is an alternate op vectorize node or its op nodes require
+          // reordering.
+          if (AllowsReordering(UserTE))
+            continue;
+          // Check if users allow reordering.
+          // Currently look up just 1 level of operands to avoid increase of
+          // the compile time.
+          // Profitable to reorder if definitely more operands allow
+          // reordering rather than those with natural order.
+          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
+          if (static_cast<unsigned>(count_if(
+                  Ops, [UserTE, &AllowsReordering](
+                           const std::pair<unsigned, TreeEntry *> &Op) {
+                    return AllowsReordering(Op.second) &&
+                           all_of(Op.second->UserTreeIndices,
+                                  [UserTE](const EdgeInfo &EI) {
+                                    return EI.UserTE == UserTE;
+                                  });
+                  })) <= Ops.size() / 2)
+            ++Res.first->second;
         }
-        OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
-            OpTE->UserTreeIndices.size();
-        assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
-        --OrdersUses[{}];
       }
       // If no orders - skip current nodes and jump to the next one, if any.
       if (OrdersUses.empty()) {
@@ -3561,7 +3613,7 @@
         OrderedEntries.remove(TE);
         if (!VisitedOps.insert(TE).second)
           continue;
-        if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) {
+        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
           // Just reorder reuses indices.
           reorderReuses(TE->ReuseShuffleIndices, Mask);
           continue;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -18,19 +18,16 @@
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 3
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub <4 x i32> , [[TMP20]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
-; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[SHUFFLE2]]
-; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]],
-; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]],
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = sub <4 x i32> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add <4 x i32> [[TMP24]], [[TMP18]]
+; CHECK-NEXT:    [[TMP26:%.*]] = add <4 x i32> [[TMP25]],
+; CHECK-NEXT:    [[TMP27:%.*]] = sub <4 x i32> [[TMP25]],
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32>
 ; CHECK-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP30:%.*]] = sub <4 x i32> [[TMP28]], zeroinitializer
 ; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i32> [[TMP29]], <4 x i32> [[TMP30]], <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll
@@ -7,10 +7,9 @@
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[ISEC:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX10]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i32 1
 ; CHECK-NEXT:    br i1 false, label [[BLOCK1:%.*]], label [[BLOCK3:%.*]]
 ; CHECK:       block1:
@@ -18,10 +17,10 @@
 ; CHECK:       block2:
 ; CHECK-NEXT:    br label [[BLOCK3]]
 ; CHECK:       block3:
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[SHUFFLE]], [[BLOCK1]] ], [ [[SHUFFLE]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ [[TMP1]], [[BLOCK1]] ], [ [[TMP1]], [[BLOCK2]] ], [ [[TMP5]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
 entry: