diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2620,7 +2620,12 @@
     /// Do we need to gather this sequence or vectorize it
     /// (either with vector instruction or with scatter/gather
     /// intrinsics for store/load)?
-    enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
+    enum EntryState {
+      Vectorize,
+      ScatterVectorize,
+      PossibleStridedVectorize,
+      NeedToGather
+    };
     EntryState State;

     /// Does this sequence require some shuffling?
@@ -2796,6 +2801,9 @@
       case ScatterVectorize:
         dbgs() << "ScatterVectorize\n";
         break;
+      case PossibleStridedVectorize:
+        dbgs() << "PossibleStridedVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -3703,7 +3711,8 @@
                                        const BoUpSLP *) {
     if (Entry->State == TreeEntry::NeedToGather)
       return "color=red";
-    if (Entry->State == TreeEntry::ScatterVectorize)
+    if (Entry->State == TreeEntry::ScatterVectorize ||
+        Entry->State == TreeEntry::PossibleStridedVectorize)
       return "color=blue";
     return "";
   }
@@ -3842,7 +3851,12 @@
 namespace {
 /// Tracks the state we can represent the loads in the given sequence.
-enum class LoadsState { Gather, Vectorize, ScatterVectorize };
+enum class LoadsState {
+  Gather,
+  Vectorize,
+  ScatterVectorize,
+  PossibleStridedVectorize
+};
 } // anonymous namespace

 static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
@@ -3902,6 +3916,7 @@
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
+    bool IsPossibleStrided = false;
     if (IsSorted) {
       Value *Ptr0;
       Value *PtrN;
@@ -3917,6 +3932,8 @@
       // Check that the sorted loads are consecutive.
       if (static_cast<unsigned>(*Diff) == VL.size() - 1)
         return LoadsState::Vectorize;
+      // Simple check if not a strided access - clear order.
+      IsPossibleStrided = *Diff % (VL.size() - 1) == 0;
     }
     // TODO: need to improve analysis of the pointers, if not all of them are
     // GEPs or have > 2 operands, we end up with a gather node, which just
@@ -3938,7 +3955,8 @@
     auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
     if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
         !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
-      return LoadsState::ScatterVectorize;
+      return IsPossibleStrided ? LoadsState::PossibleStridedVectorize
+                               : LoadsState::ScatterVectorize;
   }
 }
@@ -4139,7 +4157,8 @@
       return std::nullopt; // No need to reorder.
     return std::move(ResOrder);
   }
-  if (TE.State == TreeEntry::Vectorize &&
+  if ((TE.State == TreeEntry::Vectorize ||
+       TE.State == TreeEntry::PossibleStridedVectorize) &&
      (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
      !TE.isAltShuffle())
@@ -4390,7 +4409,9 @@
       ++Cnt;
     }
     VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
-    if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
+    if (!(TE->State == TreeEntry::Vectorize ||
+          TE->State == TreeEntry::PossibleStridedVectorize) ||
+        !TE->ReuseShuffleIndices.empty())
       GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     if (TE->State == TreeEntry::Vectorize &&
         TE->getOpcode() == Instruction::PHI)
@@ -4413,6 +4434,9 @@
     MapVector<OrdersType, unsigned,
               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
         OrdersUses;
+    // Last chance orders - scatter vectorize. Try to use their orders if no
+    // other orders or the order is counted already.
+    SmallVector<OrdersType> StridedVectorizeOrders;
     SmallPtrSet<const TreeEntry *, 4> VisitedOps;
     for (const TreeEntry *OpTE : OrderedEntries) {
       // No need to reorder this nodes, still need to extend and to use shuffle,
@@ -4459,6 +4483,11 @@
         if (Order.empty())
           continue;
       }
+      // Postpone scatter orders.
+      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+        StridedVectorizeOrders.push_back(Order);
+        continue;
+      }
       // Stores actually store the mask, not the order, need to invert.
       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
           OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4476,8 +4505,21 @@
       }
     }
     // Set order of the user node.
-    if (OrdersUses.empty())
-      continue;
+    if (OrdersUses.empty()) {
+      if (StridedVectorizeOrders.empty())
+        continue;
+      // Add (potentially!) strided vectorize orders.
+      for (OrdersType &Order : StridedVectorizeOrders)
+        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
+    } else {
+      // Account (potentially!) strided vectorize orders only if it was used
+      // already.
+      for (OrdersType &Order : StridedVectorizeOrders) {
+        auto *It = OrdersUses.find(Order);
+        if (It != OrdersUses.end())
+          ++It->second;
+      }
+    }
     // Choose the most used order.
     ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
     unsigned Cnt = OrdersUses.front().second;
@@ -4518,7 +4560,8 @@
         }
         continue;
       }
-      if (TE->State == TreeEntry::Vectorize &&
+      if ((TE->State == TreeEntry::Vectorize ||
+           TE->State == TreeEntry::PossibleStridedVectorize) &&
          isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) &&
          !TE->isAltShuffle()) {
@@ -4572,6 +4615,7 @@
       // If there are reused scalars, process this node as a regular vectorize
       // node, just reorder reuses mask.
       if (TE->State != TreeEntry::Vectorize &&
+          TE->State != TreeEntry::PossibleStridedVectorize &&
          TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
        GatherOps.push_back(TE);
      continue;
@@ -4610,12 +4654,15 @@
   for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
                               &NonVectorized](
                                  const std::unique_ptr<TreeEntry> &TE) {
-    if (TE->State != TreeEntry::Vectorize)
+    if (TE->State != TreeEntry::Vectorize &&
+        TE->State != TreeEntry::PossibleStridedVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/false)) {
       OrderedEntries.insert(TE.get());
-      if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())
+      if (!(TE->State == TreeEntry::Vectorize ||
+            TE->State == TreeEntry::PossibleStridedVectorize) ||
+          !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
     }
   });
@@ -4632,6 +4679,7 @@
     SmallVector<TreeEntry *> Filtered;
     for (TreeEntry *TE : OrderedEntries) {
       if (!(TE->State == TreeEntry::Vectorize ||
+            TE->State == TreeEntry::PossibleStridedVectorize ||
            (TE->State == TreeEntry::NeedToGather &&
             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
@@ -4678,6 +4726,9 @@
     MapVector<OrdersType, unsigned,
               DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
         OrdersUses;
+    // Last chance orders - scatter vectorize. Try to use their orders if no
+    // other orders or the order is counted already.
+    SmallVector<std::pair<OrdersType, unsigned>> StridedVectorizeOrders;
     // Do the analysis for each tree entry only once, otherwise the order of
     // the same node my be considered several times, though might be not
     // profitable.
@@ -4699,6 +4750,11 @@
           Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
             return P.second == OpTE;
           });
+      // Postpone scatter orders.
+      if (OpTE->State == TreeEntry::PossibleStridedVectorize) {
+        StridedVectorizeOrders.emplace_back(Order, NumOps);
+        continue;
+      }
       // Stores actually store the mask, not the order, need to invert.
       if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
           OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
@@ -4759,11 +4815,30 @@
       }
       // If no orders - skip current nodes and jump to the next one, if any.
       if (OrdersUses.empty()) {
-        for_each(Data.second,
-                 [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
-                   OrderedEntries.remove(Op.second);
-                 });
-        continue;
+        if (StridedVectorizeOrders.empty() ||
+            (Data.first->ReorderIndices.empty() &&
+             Data.first->ReuseShuffleIndices.empty() &&
+             !(IgnoreReorder &&
+               Data.first == VectorizableTree.front().get()))) {
+          for_each(
+              Data.second,
+              [&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {
+                OrderedEntries.remove(Op.second);
+              });
+          continue;
+        }
+        // Add (potentially!) strided vectorize orders.
+        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders)
+          OrdersUses.insert(std::make_pair(Pair.first, 0)).first->second +=
+              Pair.second;
+      } else {
+        // Account (potentially!) strided vectorize orders only if it was used
+        // already.
+        for (std::pair<OrdersType, unsigned> &Pair : StridedVectorizeOrders) {
+          auto *It = OrdersUses.find(Pair.first);
+          if (It != OrdersUses.end())
+            It->second += Pair.second;
+        }
       }
       // Choose the best order.
       ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
@@ -4802,6 +4877,7 @@
       }
       // Gathers are processed separately.
       if (TE->State != TreeEntry::Vectorize &&
+          TE->State != TreeEntry::PossibleStridedVectorize &&
          (TE->State != TreeEntry::ScatterVectorize ||
           TE->ReorderIndices.empty()))
        continue;
@@ -4832,7 +4908,8 @@
           Data.first->isAltShuffle())
         Data.first->reorderOperands(Mask);
       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
-          Data.first->isAltShuffle()) {
+          Data.first->isAltShuffle() ||
+          Data.first->State == TreeEntry::PossibleStridedVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder);
         if (Data.first->ReuseShuffleIndices.empty() &&
@@ -4893,6 +4970,7 @@
        // be used.
        if (UseScalar != U ||
            UseEntry->State == TreeEntry::ScatterVectorize ||
+           UseEntry->State == TreeEntry::PossibleStridedVectorize ||
            !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
          LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                            << ".\n");
@@ -5251,6 +5329,8 @@
      return TreeEntry::Vectorize;
    case LoadsState::ScatterVectorize:
      return TreeEntry::ScatterVectorize;
+   case LoadsState::PossibleStridedVectorize:
+     return TreeEntry::PossibleStridedVectorize;
    case LoadsState::Gather:
 #ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
@@ -5640,7 +5720,8 @@
   BasicBlock *BB = nullptr;
   bool IsScatterVectorizeUserTE =
       UserTreeIdx.UserTE &&
-      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
+      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize);
   bool AreAllSameInsts =
       (S.getOpcode() && allSameBlock(VL)) ||
       (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
@@ -5732,7 +5813,8 @@
   // Special processing for sorted pointers for ScatterVectorize node with
   // constant indeces only.
   if (AreAllSameInsts && UserTreeIdx.UserTE &&
-      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
+      (UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize ||
+       UserTreeIdx.UserTE->State == TreeEntry::PossibleStridedVectorize) &&
      !(S.getOpcode() && allSameBlock(VL))) {
    assert(S.OpValue->getType()->isPointerTy() &&
           count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=
@@ -5912,6 +5994,7 @@
      // from such a struct, we read/write packed bits disagreeing with the
      // unvectorized version.
      TreeEntry *TE = nullptr;
+     fixupOrderingIndices(CurrentOrder);
      switch (State) {
      case TreeEntry::Vectorize:
        if (CurrentOrder.empty()) {
@@ -5920,7 +6003,6 @@
                            ReuseShuffleIndicies);
          LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
        } else {
-         fixupOrderingIndices(CurrentOrder);
          // Need to reorder.
          TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                            ReuseShuffleIndicies, CurrentOrder);
@@ -5928,6 +6010,19 @@
        }
        TE->setOperandsInOrder();
        break;
+     case TreeEntry::PossibleStridedVectorize:
+       // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+       if (CurrentOrder.empty()) {
+         TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+                           UserTreeIdx, ReuseShuffleIndicies);
+       } else {
+         TE = newTreeEntry(VL, TreeEntry::PossibleStridedVectorize, Bundle, S,
+                           UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
+       }
+       TE->setOperandsInOrder();
+       buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+       LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+       break;
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
@@ -6847,6 +6942,7 @@
          switch (LS) {
          case LoadsState::Vectorize:
          case LoadsState::ScatterVectorize:
+         case LoadsState::PossibleStridedVectorize:
            // Mark the vectorized loads so that we don't vectorize them
            // again.
            if (LS == LoadsState::Vectorize)
@@ -7427,7 +7523,8 @@
  }
  InstructionCost CommonCost = 0;
  SmallVector<int> Mask;
- if (!E->ReorderIndices.empty()) {
+ if (!E->ReorderIndices.empty() &&
+     E->State != TreeEntry::PossibleStridedVectorize) {
    SmallVector<int> NewMask;
    if (E->getOpcode() == Instruction::Store) {
      // For stores the order is actually a mask.
@@ -7444,7 +7541,8 @@
    CommonCost =
        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
  assert((E->State == TreeEntry::Vectorize ||
-         E->State == TreeEntry::ScatterVectorize) &&
+         E->State == TreeEntry::ScatterVectorize ||
+         E->State == TreeEntry::PossibleStridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
@@ -7906,7 +8004,9 @@
          Instruction::Load, VecTy, LI0->getAlign(),
          LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
    } else {
-     assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+     assert((E->State == TreeEntry::ScatterVectorize ||
+             E->State == TreeEntry::PossibleStridedVectorize) &&
+            "Unknown EntryState");
      Align CommonAlignment = LI0->getAlign();
      for (Value *V : VL)
        CommonAlignment =
@@ -7921,7 +8021,8 @@
    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal node.
    // Hence address operand cost is estimated separately.
-   if (E->State == TreeEntry::ScatterVectorize)
+   if (E->State == TreeEntry::ScatterVectorize ||
+       E->State == TreeEntry::PossibleStridedVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
@@ -8106,7 +8207,8 @@
  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
      (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
-      VectorizableTree[0]->State != TreeEntry::ScatterVectorize))
+      VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
+      VectorizableTree[0]->State != TreeEntry::PossibleStridedVectorize))
    return false;

  return true;
@@ -9682,7 +9784,12 @@
 };

 Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
-  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
+  ValueList &VL = E->getOperand(NodeIdx);
+  if (E->State == TreeEntry::PossibleStridedVectorize &&
+      !E->ReorderIndices.empty()) {
+    SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
+    reorderScalars(VL, Mask);
+  }
   const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
@@ -10174,6 +10281,8 @@
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
+   } else if (E->State == TreeEntry::PossibleStridedVectorize) {
+     ShuffleBuilder.addOrdered(V, std::nullopt);
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
@@ -10181,7 +10290,8 @@
  };

  assert((E->State == TreeEntry::Vectorize ||
-         E->State == TreeEntry::ScatterVectorize) &&
+         E->State == TreeEntry::ScatterVectorize ||
+         E->State == TreeEntry::PossibleStridedVectorize) &&
         "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
@@ -10572,7 +10682,9 @@
          ExternalUses.emplace_back(PO, NewLI, FoundLane);
        }
      } else {
-       assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+       assert((E->State == TreeEntry::ScatterVectorize ||
+               E->State == TreeEntry::PossibleStridedVectorize) &&
+              "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads.ll
@@ -7,7 +7,7 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll
@@ -30,7 +30,7 @@
 ; CHECK-SLP-THRESHOLD:       bb:
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer
-; CHECK-SLP-THRESHOLD-NEXT:    [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64>
+; CHECK-SLP-THRESHOLD-NEXT:    [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64>
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> poison)
 ; CHECK-SLP-THRESHOLD-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer
 ; CHECK-SLP-THRESHOLD-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -7,7 +7,7 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
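
For reference, the IR below is a minimal illustrative sketch of the access pattern the new IsPossibleStrided check accepts; it is not taken from this patch's test files, and the function and value names are invented. The four loads read i32 elements at offsets 0, 2, 4 and 6 from %base, so the distance between the first and last pointer (6 elements) is not equal to VL.size() - 1 == 3 (hence not LoadsState::Vectorize) but is divisible by it; on a target where masked gathers are legal, canVectorizeLoads() would therefore report LoadsState::PossibleStridedVectorize instead of plain ScatterVectorize, and the bundle's order is only used as a "last chance" candidate during reordering.

; Hypothetical input, not part of this patch: a stride-2 load sequence.
define <4 x i32> @stride2_sketch(ptr %base) {
entry:
  %p1 = getelementptr inbounds i32, ptr %base, i64 2
  %p2 = getelementptr inbounds i32, ptr %base, i64 4
  %p3 = getelementptr inbounds i32, ptr %base, i64 6
  %l0 = load i32, ptr %base, align 4
  %l1 = load i32, ptr %p1, align 4
  %l2 = load i32, ptr %p2, align 4
  %l3 = load i32, ptr %p3, align 4
  ; The insertelement chain gives SLP a seed that uses all four loads.
  %v0 = insertelement <4 x i32> poison, i32 %l0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %l1, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %l2, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %l3, i32 3
  ret <4 x i32> %v3
}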