diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -504,6 +504,15 @@ namespace llvm { +static void inversePermutation(ArrayRef Indices, + SmallVectorImpl &Mask) { + const unsigned E = Indices.size(); + Mask.clear(); + Mask.resize(E, E + 1); + for (unsigned I = 0; I < E; ++I) + Mask[Indices[I]] = I; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -518,6 +527,7 @@ using StoreList = SmallVector; using ExtraValueToDebugLocsMap = MapVector>; + using OrdersType = SmallVector; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, @@ -595,6 +605,14 @@ /// \returns The best order of instructions for vectorization. Optional> bestOrder() const { + assert(llvm::all_of(NumOpsWantToKeepOrder, + [this](const decltype( + NumOpsWantToKeepOrder)::value_type &D) { + return D.getFirst().size() == + VectorizableTree[0]->Scalars.size(); + }) && + "All orders must have the same size as number of instructions in " + "tree node."); auto I = std::max_element( NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, @@ -608,6 +626,76 @@ return makeArrayRef(I->getFirst()); } + /// Builds the correct order for root instructions. + /// If some leaves have the same instructions to be vectorized, we may + /// incorrectly evaluate the best order for the root node (it is built for the + /// vector of instructions without repeated instructions and, thus, has less + /// elements than the root node). This function builds the correct order for + /// the root node. + /// For example, if the root node is \, then the leaves + /// are \ and \. When we try to vectorize the first + /// leaf, it will be shrink to \. If instructions in this leaf should + /// be reordered, the best order will be \<1, 0\>. We need to extend this + /// order for the root node. For the root node this order should look like + /// \<3, 0, 1, 2\>. This function extends the order for the reused + /// instructions. + void findRootOrder(OrdersType &Order) { + // If the leaf has the same number of instructions to vectorize as the root + // - order must be set already. + unsigned RootSize = VectorizableTree[0]->Scalars.size(); + if (Order.size() == RootSize) + return; + OrdersType RealOrder; + std::swap(Order, RealOrder); + inversePermutation(RealOrder, Order); + // The leaf has less number of instructions - need to find the true order of + // the root. + // Scan the nodes starting from the leaf back to the root. + unsigned Idx = VectorizableTree.size() - 1; + while (!VectorizableTree[Idx]->UserTreeIndices.empty()) { + const TreeEntry &Node = *VectorizableTree[Idx].get(); + Idx = Node.UserTreeIndices.back().EdgeIdx; + if (Node.ReuseShuffleIndices.empty()) + continue; + // Build the order for the parent node. + OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); + SmallVector OrderCounter(Order.size(), 0); + // The algorithm of the order extension is: + // 1. Calculate the number of the same instructions for the order. + // 2. Calculate the index of the new order: total number of instructions + // with order less than the order of the current instruction + reuse + // number of the current instruction. + // 3. The new order is just the index of the instruction in the original + // vector of the instructions. + for (unsigned I : Node.ReuseShuffleIndices) + ++OrderCounter[Order[I]]; + SmallVector CurrentCounter(Order.size(), 0); + for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { + unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; + unsigned OrderIdx = Order[ReusedIdx]; + unsigned NewIdx = 0; + for (unsigned J = 0; J < OrderIdx; ++J) + NewIdx += OrderCounter[J]; + NewIdx += CurrentCounter[OrderIdx]; + ++CurrentCounter[OrderIdx]; + assert(NewOrder[NewIdx] == RootSize && + "The order index should not be written already."); + NewOrder[NewIdx] = I; + } + std::swap(Order, NewOrder); + // If the size of the order is the same as number of instructions in the + // root node, no need to extend it more. + if (Order.size() == RootSize) + break; + } + assert((Idx == 0 || Order.size() == RootSize) && + "Root node is expected or the size of the order must be the same as " + "the number of elements in the root node."); + assert(llvm::all_of(Order, + [RootSize](unsigned Val) { return Val != RootSize; }) && + "All indices must be initialized"); + } + /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of /// the stored value. Otherwise, the size is the width of the largest loaded @@ -1435,7 +1523,7 @@ SmallVector ReuseShuffleIndices; /// Does this entry require reordering? - ArrayRef ReorderIndices; + SmallVector ReorderIndices; /// Points back to the VectorizableTree. /// @@ -1620,7 +1708,7 @@ Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - Last->ReorderIndices = ReorderIndices; + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { @@ -2140,7 +2228,6 @@ /// List of users to ignore during scheduling and that don't need extracting. ArrayRef UserIgnoreList; - using OrdersType = SmallVector; /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of /// sorted SmallVectors of unsigned. struct OrdersTypeDenseMapInfo { @@ -2601,12 +2688,10 @@ }); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. - auto StoredCurrentOrderAndNum = - NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, - StoredCurrentOrderAndNum->getFirst()); + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -2683,13 +2768,13 @@ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. - auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++I->getSecond(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -2945,15 +3030,14 @@ buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { - // Need to reorder. - auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++(I->getSecond()); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -3999,15 +4083,6 @@ return V; } -static void inversePermutation(ArrayRef Indices, - SmallVectorImpl &Mask) { - Mask.clear(); - const unsigned E = Indices.size(); - Mask.resize(E); - for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; -} - Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); @@ -6696,8 +6771,10 @@ auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); - // TODO: Handle orders of size less than number of elements in the vector. - if (Order && Order->size() == VL.size()) { + if (Order) { + assert(Order->size() == VL.size() && + "Order size must be the same as number of vectorized " + "instructions."); // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -14,11 +14,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[REORDER_SHUFFLE]] +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef -; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], ; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,16 +7,15 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A7:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]] @@ -67,16 +66,15 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A6:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]] @@ -131,16 +129,15 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]