diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2504,17 +2504,31 @@ /// instruction in the list). Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. + /// Checks if the gathered \p VL can be represented as a single register + /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. + /// permutations. Must form single-register vector. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. + /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries); + isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl &Entries, unsigned Part); + + /// Checks if the gathered \p VL can be represented as multi-register + /// shuffle(s) of previous tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector> + isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -6974,6 +6988,11 @@ BoUpSLP &R; SmallPtrSetImpl &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// While set, still trying to estimate the cost for the same nodes and we + /// can delay actual cost estimation (virtual shuffle instruction emission). + /// May help better estimate the cost if same nodes must be permuted + allows + /// to move most of the long shuffles cost estimation to TTI. + bool SameNodesEstimated = true; static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { if (Ty->getScalarType()->isPointerTy()) { @@ -7214,6 +7233,49 @@ } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. + static void transformMaskAfterShuffle(MutableArrayRef CommonMask, + ArrayRef Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given + /// mask \p Mask, register number \p Part, that includes \p SliceSize + /// elements. + void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay the cost estimation if the same nodes are reshuffling. + // If we already requested the cost of reshuffling of E1 and E2 before, no + // need to estimate another cost with the sub-Mask, instead include this + // sub-Mask into the CommonMask to estimate it later and avoid double cost + // estimation. + if ((InVectors.size() == 2 && + InVectors.front().get() == &E1 && + InVectors.back().get() == E2) || + (!E2 && InVectors.front().get() == &E1)) { + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + [](int Idx) { return Idx == PoisonMaskElem; }) && + "Expected all poisoned elements."); + ArrayRef SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + Cost += createShuffle(InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), + CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; @@ -7477,31 +7539,74 @@ // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. Cost += computeExtractCost(VL, Mask, ShuffleKind); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { - if (E1 == E2) { + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { + if (&E1 == &E2) { assert(all_of(Mask, - [=](int Idx) { - return Idx < static_cast(E1->getVectorFactor()); + [&](int Idx) { + return Idx < static_cast(E1.getVectorFactor()); }) && "Expected single vector shuffle mask."); add(E1, Mask); return; } - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); } - void add(const TreeEntry *E1, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, ArrayRef Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); } /// Adds another one input vector and the mask for the shuffling. void add(Value *V1, ArrayRef Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + if (InVectors.empty()) { + assert(CommonMask.empty() && "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + assert(InVectors.size() == 1 && InVectors.front().is() && + !CommonMask.empty() && "Expected only single entry from extracts."); + InVectors.push_back(V1); + unsigned VF = CommonMask.size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } Value *gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); @@ -7613,23 +7718,28 @@ SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffles; + SmallVector> Entries; // Check for gathered extracts. - ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); + ExtractShuffle = + tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); bool Resized = false; + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } + } else if (ExtractShuffle && + TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { + copy(VL, GatheredScalars.begin()); + } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely @@ -7639,12 +7749,12 @@ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + if (!GatherShuffles.empty()) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -7658,15 +7768,18 @@ continue; } if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = Entries.front().front()->findLaneForValue(V); } - Estimator.add(Entries.front(), Mask); + Estimator.add(*Entries.front().front(), Mask); return Estimator.finalize(E->ReuseShuffleIndices); } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -7678,7 +7791,21 @@ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle " << shortBundleName(VL) << ".\n"); - Estimator.add(Entries.front(), Entries.back(), Mask); + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + Estimator.add(*TEs.front(), *TEs.back(), VecMask); + } if (all_of(GatheredScalars, PoisonValue ::classof)) return Estimator.finalize(E->ReuseShuffleIndices); return Estimator.finalize( @@ -7692,16 +7819,19 @@ if (!all_of(GatheredScalars, PoisonValue::classof)) { auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); bool SameGathers = VL.equals(Gathers); - Value *BV = Estimator.gather( - Gathers, SameGathers ? nullptr - : Constant::getNullValue(FixedVectorType::get( - ScalarTy, GatheredScalars.size()))); + if (!SameGathers) + return Estimator.finalize( + E->ReuseShuffleIndices, E->Scalars.size(), + [&](Value *&Vec, SmallVectorImpl &Mask) { + Vec = Estimator.gather( + GatheredScalars, Constant::getNullValue(FixedVectorType::get( + ScalarTy, GatheredScalars.size()))); + }); + Value *BV = Estimator.gather(Gathers); SmallVector ReuseMask(Gathers.size(), PoisonMaskElem); std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; @@ -9024,16 +9154,10 @@ } std::optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries) { +BoUpSLP::isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl &Entries, unsigned Part) { Entries.clear(); - // No need to check for the topmost gather node. - if (TE == VectorizableTree.front().get()) - return std::nullopt; - Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); @@ -9171,8 +9295,10 @@ } } - if (UsedTEs.empty()) + if (UsedTEs.empty()) { + Entries.clear(); return std::nullopt; + } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -9325,7 +9451,10 @@ TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + if (EntryLanes.size() == Entries.size() && + !VL.equals(ArrayRef(TE->Scalars) + .slice(Part * VL.size(), + std::min(VL.size(), TE->Scalars.size())))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -9338,9 +9467,10 @@ // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; + unsigned Idx = Part * VL.size() + Pair.second; + Mask[Idx] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { case 1: @@ -9358,6 +9488,57 @@ return std::nullopt; } +SmallVector> +BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expected positive number of registers."); + Entries.clear(); + // No need to check for the topmost gather node. + if (TE == VectorizableTree.front().get()) + return {}; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + assert(VL.size() % NumParts == 0 && + "Number of scalars must be divisible by NumParts."); + unsigned SliceSize = VL.size() / NumParts; + SmallVector> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallVectorImpl &SubEntries = Entries.emplace_back(); + std::optional SubRes = + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + if (!SubRes) + SubEntries.clear(); + Res.push_back(SubRes); + if (SubEntries.size() == 1 && + SubRes.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && + SubEntries.front()->getVectorFactor() == VL.size() && + (SubEntries.front()->isSame(TE->Scalars) || + SubEntries.front()->isSame(VL))) { + Entries.clear(); + Res.clear(); + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, SubEntries.front()); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + } + if (all_of(Res, + [](const std::optional &SK) { return !SK; })) { + Entries.clear(); + return {}; + } + return Res; +} + InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -9824,9 +10005,13 @@ } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { + Value *needToDelay(const TreeEntry *E, + ArrayRef> Deps) { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) + if (all_of(Deps, [](ArrayRef TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) return nullptr; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. @@ -10159,16 +10344,17 @@ SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffles; + SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(ScalarTy, GatheredScalars.size())); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) if (auto *VecBaseTy = dyn_cast(VecBase->getType())) @@ -10183,9 +10369,10 @@ all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { + if (!GatherShuffles.empty()) { if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); @@ -10193,10 +10380,9 @@ // process to keep correct order. return Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -10204,11 +10390,11 @@ << "SLP: perfect diamond match for gather bundle " << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. - if (Entries.front()->ReorderIndices.empty() && - ((Entries.front()->ReuseShuffleIndices.empty() && - E->Scalars.size() == Entries.front()->Scalars.size()) || - (E->Scalars.size() == - Entries.front()->ReuseShuffleIndices.size()))) { + const TreeEntry *FrontTE = Entries.front().front(); + if (FrontTE->ReorderIndices.empty() && + ((FrontTE->ReuseShuffleIndices.empty() && + E->Scalars.size() == FrontTE->Scalars.size()) || + (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -10216,17 +10402,20 @@ Mask[I] = PoisonMaskElem; continue; } - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = FrontTE->findLaneForValue(V); } } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(FrontTE->VectorizedValue, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -10326,9 +10515,9 @@ } } }; - if (ExtractShuffle || GatherShuffle) { + if (ExtractShuffle || !GatherShuffles.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of @@ -10353,31 +10542,48 @@ } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat(ExtractMask); + IsUsedInExpr &= FindReusedSplat(ExtractMask); ShuffleBuilder.add(Vec1, ExtractMask); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), ExtractMask); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); - } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + if (!GatherShuffles.empty()) { + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) { + IsUsedInExpr &= FindReusedSplat(VecMask); + ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + } else { + IsUsedInExpr = false; + ShuffleBuilder.add(TEs.front()->VectorizedValue, + TEs.back()->VectorizedValue, VecMask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + } } } // Try to figure out best way to combine values: build a shuffle and insert @@ -10388,14 +10594,18 @@ int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty(); bool IsIdentityShuffle = (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || - (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + (!GatherShuffles.empty() && + all_of(GatherShuffles, + [](const std::optional &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + }) && none_of(Mask, [&](int I) { return I >= MSz; }) && ShuffleVectorInst::isIdentityMask(Mask, MSz)); bool EnoughConstsForShuffle = diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 -mattr=+avx2 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-LABEL: @test( @@ -14,18 +14,43 @@ ; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; +; AVX2-LABEL: @test( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 +; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] +; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] +; AVX2-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] +; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] +; AVX2-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; AVX2-NEXT: br label [[BB:%.*]] +; AVX2: bb: +; AVX2-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; AVX2-NEXT: br label [[BB]] +; entry: %a0 = add i64 %p0, %p0 %a1 = add i64 %p1, %p1