diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -429,26 +429,6 @@ /// i32 6> /// %2 = mul <4 x i8> %1, %1 /// ret <4 x i8> %2 -/// We convert this initially to something like: -/// %x0 = extractelement <4 x i8> %x, i32 0 -/// %x3 = extractelement <4 x i8> %x, i32 3 -/// %y1 = extractelement <4 x i8> %y, i32 1 -/// %y2 = extractelement <4 x i8> %y, i32 2 -/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0 -/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1 -/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2 -/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3 -/// %5 = mul <4 x i8> %4, %4 -/// %6 = extractelement <4 x i8> %5, i32 0 -/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0 -/// %7 = extractelement <4 x i8> %5, i32 1 -/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1 -/// %8 = extractelement <4 x i8> %5, i32 2 -/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2 -/// %9 = extractelement <4 x i8> %5, i32 3 -/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 -/// ret <4 x i8> %ins4 -/// InstCombiner transforms this into a shuffle and vector mul /// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// ShuffleVectorInst/getShuffleCost? @@ -2455,6 +2435,12 @@ /// \p E. Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); + /// Create a new vector from a list of scalar values. Produces a sequence + /// which exploits values reused across lanes, and arranges the inserts + /// for ease of later optimization. + template + ResTy processBuildVector(const TreeEntry *E, Args &...Params); + /// Create a new vector from a list of scalar values. Produces a sequence /// which exploits values reused across lanes, and arranges the inserts /// for ease of later optimization. 
@@ -6598,9 +6584,9 @@ /// Smart shuffle instruction emission, walks through shuffles trees and /// tries to find the best matching vector for the actual shuffle /// instruction. - template - static Value *createShuffle(Value *V1, Value *V2, ArrayRef Mask, - ShuffleBuilderTy &Builder) { + template + static T createShuffle(Value *V1, Value *V2, ArrayRef Mask, + ShuffleBuilderTy &Builder) { assert(V1 && "Expected at least one vector value."); if (V2) Builder.resizeToMatch(V1, V2); @@ -6699,21 +6685,21 @@ isa(Op1) && cast(Op1)->getShuffleMask() == ArrayRef(CombinedMask1)))) - return Op1; + return Builder.createIdentity(Op1); return Builder.createShuffleVector( Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2, CombinedMask1); } if (isa(V1)) - return PoisonValue::get(FixedVectorType::get( - cast(V1->getType())->getElementType(), Mask.size())); + return Builder.createPoison( + cast(V1->getType())->getElementType(), Mask.size()); SmallVector NewMask(Mask.begin(), Mask.end()); bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true); assert(V1 && "Expected non-null value after looking through shuffles."); if (!IsIdentity) return Builder.createShuffleVector(V1, NewMask); - return V1; + return Builder.createIdentity(V1); } }; } // namespace @@ -6728,6 +6714,7 @@ bool IsFinalized = false; SmallVector CommonMask; SmallVector InVectors; + std::optional SingleEntry; const TargetTransformInfo &TTI; InstructionCost Cost = 0; ArrayRef VectorizedVals; @@ -6863,13 +6850,12 @@ /// Compute the cost of creating a vector of type \p VecTy containing the /// extracted values from \p VL. 
- InstructionCost computeExtractCost(ArrayRef VL, ArrayRef Mask, - TTI::ShuffleKind ShuffleKind) { + InstructionCost computeExtractCost(ArrayRef VL, ArrayRef Mask) { auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); unsigned NumOfParts = TTI.getNumberOfParts(VecTy); - if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || - !NumOfParts || VecTy->getNumElements() < NumOfParts) + constexpr TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; + if (!NumOfParts || VecTy->getNumElements() < NumOfParts) return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); bool AllConsecutive = true; @@ -6916,13 +6902,66 @@ // cannot re-use the source vector register directly, compute the shuffle // cost to extract the vector with EltsPerVector elements. Cost += TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, + ShuffleKind, FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask); } return Cost; } + class ShuffleCostBuilder { + const TargetTransformInfo &TTI; + + static bool isEmptyOrIdentity(ArrayRef Mask, unsigned VF) { + int Limit = 2 * VF; + return Mask.empty() || + (VF == Mask.size() && + all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) && + ShuffleVectorInst::isIdentityMask(Mask)); + } + + public: + ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} + ~ShuffleCostBuilder() = default; + InstructionCost createShuffleVector(Value *V1, Value *, + ArrayRef Mask) const { + // Empty mask or identity mask are free. + unsigned VF = + cast(V1->getType())->getElementCount().getKnownMinValue(); + if (isEmptyOrIdentity(Mask, VF)) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteTwoSrc, + FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createShuffleVector(Value *V1, ArrayRef Mask) const { + // Empty mask or identity mask are free. 
+ if (isEmptyOrIdentity(Mask, Mask.size())) + return TTI::TCC_Free; + return TTI.getShuffleCost( + TTI::SK_PermuteSingleSrc, + FixedVectorType::get( + cast(V1->getType())->getElementType(), Mask.size()), + Mask); + } + InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } + InstructionCost createPoison(Type *Ty, unsigned VF) const { + return TTI::TCC_Free; + } + void resizeToMatch(Value *&, Value *&) const {} + }; + + /// Smart shuffle instruction emission, walks through shuffles trees and + /// tries to find the best matching vector for the actual shuffle + /// instruction. + InstructionCost createShuffle(Value *V1, Value *V2, ArrayRef Mask) { + ShuffleCostBuilder Builder(TTI); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, + Builder); + } + public: ShuffleCostEstimator(TargetTransformInfo &TTI, ArrayRef VectorizedVals, BoUpSLP &R, @@ -6938,10 +6977,8 @@ auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size()); // If the resulting type is scalarized, do not adjust the cost. unsigned VecNumParts = TTI.getNumberOfParts(VecTy); - if (VecNumParts == VecTy->getNumElements()) { - InVectors.assign(1, Constant::getNullValue(VecTy)); + if (VecNumParts == VecTy->getNumElements()) return nullptr; - } DenseMap ExtractVectorsTys; for (auto [I, V] : enumerate(VL)) { // Ignore non-extractelement scalars. @@ -7017,54 +7054,199 @@ VecTy, std::nullopt, CostKind, 0, EEVTy); } } - // Check that gather of extractelements can be represented as just a - // shuffle of a single/two vectors the scalars are extracted from. - // Found the bunch of extractelement instructions that must be gathered - // into a vector and can be represented as a permutation elements in a - // single input vector or of 2 input vectors. 
- Cost += computeExtractCost(VL, Mask, ShuffleKind); - InVectors.assign(1, Constant::getNullValue(VecTy)); + if (ShuffleKind == TTI::SK_PermuteSingleSrc) + SingleEntry = E; return VecBase; } + std::optional + needToDelay(const TreeEntry *, ArrayRef) const { + // No need to delay the cost estimation during analysis. + return std::nullopt; + } + InstructionCost getSameNode(const TreeEntry *E) { return TTI::TCC_Free; } void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign( - 2, Constant::getNullValue(FixedVectorType::get( - E1->Scalars.front()->getType(), - std::max(E1->getVectorFactor(), E2->getVectorFactor())))); + // Use zeroinitializer instead of actual vector value here, since they are + // not ready yet. + add(Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor())), + Constant::getNullValue(FixedVectorType::get( + E2->Scalars.front()->getType(), E2->getVectorFactor())), + Mask); } void add(const TreeEntry *E1, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign( - 1, Constant::getNullValue(FixedVectorType::get( - E1->Scalars.front()->getType(), E1->getVectorFactor()))); + // Use zeroinitializer instead of actual vector value here, since they are + // not ready yet. + add(Constant::getNullValue(FixedVectorType::get( + E1->Scalars.front()->getType(), E1->getVectorFactor())), + Mask); + } + /// Adds 2 input vectors and the mask for their shuffling. 
+  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
+    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
+    SingleEntry = nullptr; // finalize() must not use the single-entry path.
+    if (InVectors.empty()) { // Nothing pending: record the inputs and mask.
+      InVectors.push_back(V1);
+      InVectors.push_back(V2);
+      CommonMask.assign(Mask.begin(), Mask.end());
+      return;
+    }
+    Value *Vec = InVectors.front();
+    if (InVectors.size() == 2) { // Charge merging the two pending inputs.
+      Cost += createShuffle(Vec, InVectors.back(), CommonMask);
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (CommonMask[Idx] != UndefMaskElem) // Lane already populated.
+          CommonMask[Idx] = Idx;
+    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
+               Mask.size()) { // Charge resizing the single pending input.
+      Cost += createShuffle(Vec, nullptr, CommonMask);
+      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+        if (CommonMask[Idx] != UndefMaskElem) // Lane already populated.
+          CommonMask[Idx] = Idx;
+    }
+    Cost += createShuffle(V1, V2, Mask); // Charge combining the new pair.
+    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
+      if (Mask[Idx] != UndefMaskElem) // Lane is taken from the new pair,
+        CommonMask[Idx] = Idx + Sz;   // which is stored as the second input.
+    InVectors.front() = Vec;
+    if (InVectors.size() == 2)
+      InVectors.back() = V1;
+    else
+      InVectors.push_back(V1);
+  }
+  /// Adds another one input vector and the mask for the shuffling.
+ void add(Value *V1, ArrayRef Mask) { + if (InVectors.empty()) { + if (!isa(V1->getType())) { + Cost += createShuffle(V1, nullptr, CommonMask); + CommonMask.assign(Mask.size(), UndefMaskElem); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } + InVectors.push_back(V1); + CommonMask.assign(Mask.begin(), Mask.end()); + return; + } + SingleEntry = nullptr; + const auto *It = find(InVectors, V1); + if (It == InVectors.end()) { + if (InVectors.size() == 2 || + InVectors.front()->getType() != V1->getType() || + !isa(V1->getType())) { + Value *V = InVectors.front(); + if (InVectors.size() == 2) { + Cost += + createShuffle(InVectors.front(), InVectors.back(), CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } else if (cast(V->getType())->getNumElements() != + CommonMask.size()) { + Cost += createShuffle(InVectors.front(), nullptr, CommonMask); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] == UndefMaskElem && Mask[Idx] != UndefMaskElem) + CommonMask[Idx] = + V->getType() != V1->getType() + ? Idx + Sz + : Mask[Idx] + cast(V1->getType()) + ->getNumElements(); + if (V->getType() != V1->getType()) + Cost += createShuffle(V1, nullptr, Mask); + InVectors.front() = V; + if (InVectors.size() == 2) + InVectors.back() = V1; + else + InVectors.push_back(V1); + return; + } + // Check if second vector is required if the used elements are already + // used from the first one. 
+ for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) { + InVectors.push_back(V1); + break; + } + } + int VF = CommonMask.size(); + if (auto *FTy = dyn_cast(V1->getType())) + VF = FTy->getNumElements(); + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != UndefMaskElem && CommonMask[Idx] == UndefMaskElem) + CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); } - void gather(ArrayRef VL, Value *Root = nullptr) { + /// Adds another one input vector and the mask for the shuffling. + void addOrdered(Value *V1, ArrayRef Order) { + SmallVector NewMask; + inversePermutation(Order, NewMask); + add(V1, NewMask); + SingleEntry = nullptr; + } + Value *gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); if (!Root) { - assert(InVectors.empty() && "Unexpected input vectors for buildvector."); - InVectors.assign(1, Constant::getNullValue(FixedVectorType::get( - VL.front()->getType(), VL.size()))); + SmallVector Vals; + for (Value *V : VL) { + if (isa(V)) { + Vals.push_back(cast(V)); + continue; + } + Vals.push_back(Constant::getNullValue(V->getType())); + } + return ConstantVector::get(Vals); } + return ConstantVector::getSplat( + ElementCount::getFixed(VL.size()), + Constant::getNullValue(VL.front()->getType())); } + InstructionCost createFreeze(InstructionCost Cost) { return Cost; } /// Finalize emission of the shuffles. 
- InstructionCost finalize(ArrayRef ExtMask) { + InstructionCost + finalize(ArrayRef ExtMask, unsigned VF = 0, + function_ref &)> Action = {}) { IsFinalized = true; + if (Action) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Cost += createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Cost += createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + assert(VF > 0 && + "Expected vector length for the final value before action."); + Action(Vec, CommonMask); + InVectors.front() = Vec; + SingleEntry = nullptr; + } ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true); - if (CommonMask.empty()) + if (CommonMask.empty()) { + assert(InVectors.size() == 1 && "Expected only one vector with no mask"); return Cost; + } + if (InVectors.size() == 2) + return Cost + + createShuffle(InVectors.front(), InVectors.back(), CommonMask); int Limit = CommonMask.size() * 2; if (all_of(CommonMask, [=](int Idx) { return Idx < Limit; }) && ShuffleVectorInst::isIdentityMask(CommonMask)) return Cost; - return Cost + - TTI.getShuffleCost(InVectors.size() == 2 ? TTI::SK_PermuteTwoSrc - : TTI::SK_PermuteSingleSrc, - FixedVectorType::get( - cast(InVectors.front()->getType()) - ->getElementType(), - CommonMask.size()), - CommonMask); + if (const TreeEntry *TE = SingleEntry.value_or(nullptr); + TE && CommonMask.size() == TE->Scalars.size()) { + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + // Found the bunch of extractelement instructions that must be gathered + // into a vector and can be represented as a permutation elements in a + // single input vector or of 2 input vectors. 
+ return Cost + computeExtractCost(TE->Scalars, CommonMask); + } + return Cost + createShuffle(InVectors.front(), nullptr, CommonMask); } ~ShuffleCostEstimator() { @@ -7102,94 +7284,8 @@ return 0; if (isa(VL[0])) return InstructionCost::getInvalid(); - ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this, - CheckedExtracts); - unsigned VF = E->getVectorFactor(); - SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); - SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); - // Build a mask out of the reorder indices and reorder scalars per this - // mask. - SmallVector ReorderMask; - inversePermutation(E->ReorderIndices, ReorderMask); - if (!ReorderMask.empty()) - reorderScalars(GatheredScalars, ReorderMask); - SmallVector Mask; - SmallVector ExtractMask; - std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; - Type *ScalarTy = GatheredScalars.front()->getType(); - // Check for gathered extracts. - ExtractShuffle = tryToGatherExtractElements(GatheredScalars, ExtractMask); - SmallVector IgnoredVals; - if (UserIgnoreList) - IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); - - bool Resized = false; - if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) - if (auto *VecBaseTy = dyn_cast(VecBase->getType())) - if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { - Resized = true; - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - - // Do not try to look for reshuffled loads for gathered loads (they will be - // handled later), for vectorized scalars, and cases, which are definitely - // not profitable (splats and small gather nodes.) 
- if (ExtractShuffle || E->getOpcode() != Instruction::Load || - E->isAltShuffle() || - all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || - isSplat(E->Scalars) || - (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { - // Perfect match in the graph, will reuse the previously vectorized - // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle that starts with " - << *VL.front() << ".\n"); - return 0; - } - if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) - GatheredScalars.append(VF - GatheredScalars.size(), - PoisonValue::get(ScalarTy)); - } - // Remove shuffled elements from list of gathers. - for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { - if (Mask[I] != UndefMaskElem) - GatheredScalars[I] = PoisonValue::get(ScalarTy); - } - LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() - << " entries for bundle that starts with " - << *VL.front() << ".\n";); - if (Entries.size() == 1) - Estimator.add(Entries.front(), Mask); - else - Estimator.add(Entries.front(), Entries.back(), Mask); - Estimator.gather( - GatheredScalars, - Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), GatheredScalars.size()))); - return Estimator.finalize(E->ReuseShuffleIndices); - } - Estimator.gather( - GatheredScalars, - VL.equals(GatheredScalars) - ? 
nullptr - : Constant::getNullValue(FixedVectorType::get( - GatheredScalars.front()->getType(), GatheredScalars.size()))); - return Estimator.finalize(E->ReuseShuffleIndices); + return processBuildVector( + E, *TTI, VectorizedVals, *this, CheckedExtracts); } InstructionCost CommonCost = 0; SmallVector Mask; @@ -9171,6 +9267,10 @@ } return Vec; } + Value *createIdentity(Value *V) { return V; } + Value *createPoison(Type *Ty, unsigned VF) { + return PoisonValue::get(FixedVectorType::get(Ty, VF)); + } /// Resizes 2 input vector to match the sizes, if the they are not equal /// yet. The smallest vector is resized to the size of the larger vector. void resizeToMatch(Value *&V1, Value *&V2) { @@ -9203,7 +9303,8 @@ assert(V1 && "Expected at least one vector value."); ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, R.CSEBlocks); - return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder); + return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, + ShuffleBuilder); } /// Transforms mask \p CommonMask per given \p Mask to make proper set after @@ -9220,7 +9321,8 @@ : Builder(Builder), R(R) {} /// Adjusts extractelements after reusing them. - Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask) { + Value *adjustExtracts(const TreeEntry *E, ArrayRef Mask, + TTI::ShuffleKind) { Value *VecBase = nullptr; for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { int Idx = Mask[I]; @@ -9240,10 +9342,11 @@ } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { + std::optional needToDelay(const TreeEntry *E, + ArrayRef Deps) const { // No need to delay emission if all deps are ready. if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) - return nullptr; + return std::nullopt; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. 
auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(), @@ -9252,6 +9355,13 @@ VecTy, PoisonValue::get(VecTy->getPointerTo()), MaybeAlign()); return Vec; } + Value *getSameNode(const TreeEntry *E) { return E->VectorizedValue; } + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + add(E1->VectorizedValue, E2->VectorizedValue, Mask); + } + void add(const TreeEntry *E1, ArrayRef Mask) { + add(E1->VectorizedValue, Mask); + } /// Adds 2 input vectors and the mask for their shuffling. void add(Value *V1, Value *V2, ArrayRef Mask) { assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors."); @@ -9343,6 +9453,10 @@ inversePermutation(Order, NewMask); add(V1, NewMask); } + Value *gather(ArrayRef VL, Value *Root = nullptr) { + return R.gather(VL, Root); + } + Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } /// Finalize emission of the shuffles. /// \param Action the action (if any) to be performed before final applying of /// the \p ExtMask mask. 
@@ -9511,7 +9625,8 @@ return vectorizeTree(I->get()); } -Value *BoUpSLP::createBuildVector(const TreeEntry *E) { +template +ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { assert(E->State == TreeEntry::NeedToGather && "Expected gather node."); unsigned VF = E->getVectorFactor(); @@ -9552,8 +9667,8 @@ std::fill(Mask.begin(), Mask.end(), I); return true; }; - ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); - Value *Vec = nullptr; + BVTy ShuffleBuilder(Params...); + ResTy Res = ResTy(); SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; @@ -9567,7 +9682,8 @@ if (UserIgnoreList) IgnoredVals.assign(UserIgnoreList->begin(), UserIgnoreList->end()); bool Resized = false; - if (Value *VecBase = ShuffleBuilder.adjustExtracts(E, ExtractMask)) + if (Value *VecBase = ShuffleBuilder.adjustExtracts( + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; @@ -9583,12 +9699,13 @@ GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); } if (GatherShuffle) { - if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { + if (std::optional Delayed = + ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); // Postpone gather emission, will be emitted after the end of the // process to keep correct order. - return Delayed; + return *Delayed; } assert((Entries.size() == 1 || Entries.size() == 2) && "Expected shuffle of 1 or 2 entries."); @@ -9600,18 +9717,8 @@ dbgs() << "SLP: perfect diamond match for gather bundle that starts with " << *E->Scalars.front() << ".\n"); - // Restore the mask for previous partially matched values. 
- for (auto [I, V] : enumerate(E->Scalars)) { - if (isa(V)) { - Mask[I] = UndefMaskElem; - continue; - } - if (Mask[I] == UndefMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); - } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); - return Vec; + Res = ShuffleBuilder.getSameNode(Entries.front()); + return Res; } if (!Resized) { unsigned VF1 = Entries.front()->getVectorFactor(); @@ -9759,15 +9866,16 @@ if (GatherShuffle) { if (Entries.size() == 1) { IsUsedInExpr = FindReusedSplat(Mask); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); + ShuffleBuilder.add(Entries.front(), Mask); + if (Entries.front()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); } else { - ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + ShuffleBuilder.add(Entries.front(), Entries.back(), Mask); + if (Entries.front()->VectorizedValue && Entries.back()->VectorizedValue) + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && + isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); } } // Try to figure out best way to combine values: build a shuffle and insert @@ -9817,7 +9925,7 @@ if (!all_of(GatheredScalars, PoisonValue::classof)) { SmallVector BVMask(GatheredScalars.size(), UndefMaskElem); TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); - Value *BV = gather(GatheredScalars); + Value *BV = ShuffleBuilder.gather(GatheredScalars); ShuffleBuilder.add(BV, BVMask); } if (all_of(NonConstants, [=](Value *V) { @@ -9825,21 +9933,21 @@ (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa(V)); })) - Vec = 
ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); else - Vec = ShuffleBuilder.finalize( + Res = ShuffleBuilder.finalize( E->ReuseShuffleIndices, E->Scalars.size(), [&](Value *&Vec, SmallVectorImpl &Mask) { TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false); - Vec = gather(NonConstants, Vec); + Vec = ShuffleBuilder.gather(NonConstants, Vec); }); } else if (!allConstant(GatheredScalars)) { // Gather unique scalars and all constants. SmallVector ReuseMask(GatheredScalars.size(), UndefMaskElem); TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); - Vec = gather(GatheredScalars); - ShuffleBuilder.add(Vec, ReuseMask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Value *BV = ShuffleBuilder.gather(GatheredScalars); + ShuffleBuilder.add(BV, ReuseMask); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { // Gather all constants. SmallVector Mask(E->Scalars.size(), UndefMaskElem); @@ -9847,14 +9955,19 @@ if (!isa(V)) Mask[I] = I; } - Vec = gather(E->Scalars); - ShuffleBuilder.add(Vec, Mask); - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + Value *BV = ShuffleBuilder.gather(E->Scalars); + ShuffleBuilder.add(BV, Mask); + Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } if (NeedFreeze) - Vec = Builder.CreateFreeze(Vec); - return Vec; + Res = ShuffleBuilder.createFreeze(Res); + return Res; +} + +Value *BoUpSLP::createBuildVector(const TreeEntry *E) { + return processBuildVector(E, Builder, + *this); } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll --- a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll @@ -23,10 +23,11 @@ ;; the vector store that replaces them. 
; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32), metadata ![[ID:[0-9]+]], metadata ptr %arrayidx, metadata !DIExpression()) +; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID]] ; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 4)) -; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 8)) -; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]] -; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12)) +; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata ![[ID1:[0-9]+]], metadata ptr %arrayidx7, metadata !DIExpression()) +; CHECK: store <2 x float> {{.*}} !DIAssignID ![[ID1]] +; CHECK: call void @llvm.dbg.assign(metadata float undef, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 96, 32), metadata ![[ID1]], metadata ptr %quad, metadata !DIExpression(DW_OP_plus_uconst, 12)) target triple = "x86_64-unknown-unknown" diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -3,27 +3,21 @@ define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0 -; CHECK-NEXT: 
[[TMP5:%.*]] = or i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; CHECK-NEXT: br label [[TMP17:%.*]] -; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ] -; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ] -; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0 -; CHECK-NEXT: br label [[TMP17]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> , <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> , <4 x i32> 
+; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]] +; CHECK-NEXT: [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> +; CHECK-NEXT: br label [[TMP11]] ; %4 = extractelement <2 x i64> %1, i64 0 %5 = or i64 %4, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -30,14 +30,14 @@ define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8 ; CHECK-NEXT: ret void ; %a.1 = getelementptr i64, ptr %a, i64 1 @@ -101,8 +101,8 @@ ; 
CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -167,10 +167,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -30,14 +30,14 @@ define void @store_chain_v2i64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @store_chain_v2i64( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: 
[[TMP4:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[C:%.*]], align 8 ; CHECK-NEXT: ret void ; %a.1 = getelementptr i64, ptr %a, i64 1 @@ -101,8 +101,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP6]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 %v0.1 = extractelement <2 x i32> %v0, i32 1 @@ -167,10 +167,10 @@ ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] 
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -18,19 +18,20 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector 
<4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[A]], align 4 +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[LD2:%.*]] = load float, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul fast float [[LD0]], [[LD1]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul fast float [[LD2]], [[LD1]] +; CHECK-NEXT: store float [[MUL0]], ptr [[A]], align 4 +; CHECK-NEXT: store float [[MUL1]], ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[LD2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP1]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[GEP2]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -16,13 +16,28 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; SSE-LABEL: @splat( -; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer -; SSE-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]] -; SSE-NEXT: store <16 x i8> [[TMP6]], ptr 
@cle, align 16 +; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]] +; SSE-NEXT: store i8 [[TMP1]], ptr @cle, align 16 +; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]] +; SSE-NEXT: store i8 [[TMP2]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 1), align 1 +; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]] +; SSE-NEXT: store i8 [[TMP3]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 2), align 1 +; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]] +; SSE-NEXT: store i8 [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 3), align 1 +; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]] +; SSE-NEXT: store i8 [[TMP5]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 4), align 1 +; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]] +; SSE-NEXT: store i8 [[TMP6]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 5), align 1 +; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]] +; SSE-NEXT: store i8 [[TMP7]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 6), align 1 +; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] +; SSE-NEXT: store i8 [[TMP8]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 7), align 1 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32> zeroinitializer +; SSE-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> poison, i8 [[C]], i32 0 +; SSE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> poison, <8 x i32> zeroinitializer +; SSE-NEXT: [[TMP13:%.*]] = xor <8 x i8> [[TMP10]], [[TMP12]] +; SSE-NEXT: store <8 x i8> [[TMP13]], ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 8), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ 
-1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512 ; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 @@ -25,20 +25,33 @@ ; ; AVX-LABEL: @foo( ; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX-NEXT: store i32 [[TMP1]], ptr @a, align 16 ; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; AVX-NEXT: store <8 x i32> [[SHUFFLE]], ptr @a, align 16 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4 +; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4 +; AVX-NEXT: store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4 +; AVX-NEXT: 
store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8 +; AVX-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4 ; AVX-NEXT: ret void ; +; AVX2-LABEL: @foo( +; AVX2-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX2-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX2-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 +; AVX2-NEXT: ret void +; ; AVX512-LABEL: @foo( ; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 ; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8 ; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 ; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> -; AVX512-NEXT: store <8 x i32> [[SHUFFLE]], ptr @a, align 16 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; %1 = load i32, ptr @b, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll @@ -2,23 +2,24 @@ ; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -pass-remarks-output=%t | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s -; YAML: --- !Passed +; YAML: --- !Missed ; 
YAML-NEXT: Pass: slp-vectorizer -; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Name: NotBeneficial ; YAML-NEXT: Function: g ; YAML-NEXT: Args: -; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-1' -; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '4' +; YAML-NEXT: - String: 'List vectorization was possible but not beneficial with cost ' +; YAML-NEXT: - Cost: '0' +; YAML-NEXT: - String: ' >= ' +; YAML-NEXT: - Treshold: '0' define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) { ; CHECK-LABEL: @g( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[B:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: ret <2 x i32> [[TMP4]] +; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 1 +; CHECK-NEXT: [[X1X1:%.*]] = mul i32 [[X1]], [[X1]] +; CHECK-NEXT: [[AB:%.*]] = mul i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i32> poison, i32 [[X1X1]], i32 0 +; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i32> [[INS1]], i32 [[AB]], i32 1 +; CHECK-NEXT: ret <2 x i32> [[INS2]] ; %x1 = extractelement <2 x i32> %x, i32 1 %x1x1 = mul i32 %x1, %x1