diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6286,6 +6286,262 @@
   return {VK, VP};
 }
 
+namespace {
+/// The base class for shuffle instruction emission and shuffle cost
+/// estimation.
+class BaseShuffleAnalysis {
+protected:
+  /// Checks if the mask is an identity mask.
+  /// \param IsStrict if true, the function returns false if the mask size
+  /// does not match the vector size.
+  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
+                             bool IsStrict) {
+    int Limit = Mask.size();
+    int VF = VecTy->getNumElements();
+    return (VF == Limit || !IsStrict) &&
+           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+           ShuffleVectorInst::isIdentityMask(Mask);
+  }
+
+  /// Tries to combine 2 different masks into a single one.
+  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
+                           ArrayRef<int> ExtMask) {
+    unsigned VF = Mask.size();
+    SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+      if (ExtMask[I] == UndefMaskElem)
+        continue;
+      int MaskedIdx = Mask[ExtMask[I] % VF];
+      NewMask[I] =
+          MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+    }
+    Mask.swap(NewMask);
+  }
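
Editorial aside: combineMasks composes an outer mask on top of an inner one, renormalizing the selected indices by the local vector factor. A minimal standalone sketch of the composition step, in plain C++ with -1 standing in for UndefMaskElem; the % LocalVF renormalization above is left out for clarity, and all names here are illustrative, not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int Undef = -1; // stands in for UndefMaskElem

// Compose OuterMask over InnerMask: element I of the result selects the
// element that InnerMask selected at position OuterMask[I].
std::vector<int> composeMasks(const std::vector<int> &InnerMask,
                              const std::vector<int> &OuterMask) {
  std::vector<int> Result(OuterMask.size(), Undef);
  for (std::size_t I = 0; I < OuterMask.size(); ++I)
    if (OuterMask[I] != Undef)
      Result[I] = InnerMask[OuterMask[I] % InnerMask.size()];
  return Result;
}

int main() {
  // %s = shufflevector %v, poison, <1, 0>; shuffling %s again with <1, 0>
  // composes to the identity over %v.
  assert(composeMasks({1, 0}, {1, 0}) == (std::vector<int>{0, 1}));
  return 0;
}
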
+
+  /// Looks through shuffles trying to reduce the final number of shuffles in
+  /// the code. The function looks through the previously emitted shuffle
+  /// instructions and properly marks indices in the mask as undef.
+  /// For example, given the code
+  /// \code
+  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
+  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
+  /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
+  /// it will look through %s1 and %s2 and emit
+  /// \code
+  /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
+  /// \endcode
+  /// instead.
+  /// If 2 operands are of different size, the smaller one will be resized and
+  /// the mask recalculated properly.
+  /// For example, given the code
+  /// \code
+  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
+  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
+  /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
+  /// it will look through %s1 and %s2 and emit
+  /// \code
+  /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
+  /// \endcode
+  /// instead.
+  /// So, it tries to transform permutations to simple vector merges, if
+  /// possible.
+  /// \param V The input vector which must be shuffled using the given \p
+  /// Mask. If a better candidate is found, \p V is set to this best candidate
+  /// vector.
+  /// \param Mask The input mask for the shuffle. If a better candidate is
+  /// found during the look-through-shuffles attempt, it is updated
+  /// accordingly.
+  /// \param SinglePermute true if the shuffle operation is originally a
+  /// single-value-permutation. In this case the look-through-shuffles
+  /// procedure may look for resizing shuffles as the best candidates.
+  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
+                                  bool SinglePermute) {
+    Value *Op = V;
+    ShuffleVectorInst *IdentityOp = nullptr;
+    SmallVector<int> IdentityMask;
+    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
+      // Exit if not a fixed vector type or changing size shuffle.
+      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
+      if (!SVTy)
+        break;
+      // Remember the identity or broadcast mask, if it is not a resizing
+      // shuffle. If no better candidates are found, this Op and Mask will be
+      // used in the final shuffle.
+      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
+        if (!IdentityOp || !SinglePermute ||
+            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
+             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+          IdentityOp = SV;
+          // Store the current mask in IdentityMask so that we do not lose
+          // this info if IdentityOp is selected as the best candidate for
+          // the permutation.
+          IdentityMask.assign(Mask);
+        }
+      }
+      // Remember the broadcast mask. If no better candidates are found, this
+      // Op and Mask will be used in the final shuffle.
+      if (SV->isZeroEltSplat()) {
+        IdentityOp = SV;
+        IdentityMask.assign(Mask);
+      }
+      int LocalVF = Mask.size();
+      if (auto *SVOpTy =
+              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
+        LocalVF = SVOpTy->getNumElements();
+      bool IsOp1Undef =
+          isUndefVector(SV->getOperand(0),
+                        buildUseMask(LocalVF, Mask, UseMask::FirstArg))
+              .all();
+      bool IsOp2Undef =
+          isUndefVector(SV->getOperand(1),
+                        buildUseMask(LocalVF, Mask, UseMask::SecondArg))
+              .all();
+      if (!IsOp1Undef && !IsOp2Undef) {
+        // Update mask and mark undef elems.
+        for (auto [Idx, I] : enumerate(Mask)) {
+          if (I == UndefMaskElem)
+            continue;
+          if (SV->getShuffleMask()[I % SV->getShuffleMask().size()] ==
+              UndefMaskElem)
+            I = UndefMaskElem;
+        }
+        break;
+      }
+      SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
+                                   SV->getShuffleMask().end());
+      combineMasks(LocalVF, ShuffleMask, Mask);
+      Mask.swap(ShuffleMask);
+      if (IsOp2Undef)
+        Op = SV->getOperand(0);
+      else
+        Op = SV->getOperand(1);
+    }
+    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
+        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+      if (IdentityOp) {
+        V = IdentityOp;
+        assert(Mask.size() == IdentityMask.size() &&
+               "Expected masks of same sizes.");
+        // Clear known poison elements.
+        for (auto [I, Idx] : enumerate(Mask))
+          if (Idx == UndefMaskElem)
+            IdentityMask[I] = UndefMaskElem;
+        Mask.swap(IdentityMask);
+        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
+        return SinglePermute &&
+               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
+                               /*IsStrict=*/true) ||
+                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
+                 Shuffle->isZeroEltSplat() &&
+                 ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+      }
+      V = Op;
+      return false;
+    }
+    V = Op;
+    return true;
+  }
+
+  /// Smart shuffle instruction emission, walks through shuffle trees and
+  /// tries to find the best matching vector for the actual shuffle
+  /// instruction.
+  template <typename ShuffleBuilderTy>
+  static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+                              ShuffleBuilderTy &Builder) {
+    assert(V1 && "Expected at least one vector value.");
+    int VF = Mask.size();
+    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
+      VF = FTy->getNumElements();
+    if (V2 &&
+        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
+      // Peek through shuffles.
+      Value *Op1 = V1;
+      Value *Op2 = V2;
+      int VF = cast<VectorType>(V1->getType())
+                   ->getElementCount()
+                   .getKnownMinValue();
+      SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+      SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+      for (int I = 0, E = Mask.size(); I < E; ++I) {
+        if (Mask[I] < VF)
+          CombinedMask1[I] = Mask[I];
+        else
+          CombinedMask2[I] = Mask[I] - VF;
+      }
+      Value *PrevOp1;
+      Value *PrevOp2;
+      do {
+        PrevOp1 = Op1;
+        PrevOp2 = Op2;
+        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
+        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
+        // Check if we have 2 resizing shuffles - need to peek through operands
+        // again.
+        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
+          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
+            SmallBitVector UseMask1 = buildUseMask(
+                cast<FixedVectorType>(SV1->getOperand(1)->getType())
+                    ->getNumElements(),
+                CombinedMask1, UseMask::FirstArg);
+            SmallBitVector UseMask2 = buildUseMask(
+                cast<FixedVectorType>(SV2->getOperand(1)->getType())
+                    ->getNumElements(),
+                CombinedMask2, UseMask::FirstArg);
+            if (SV1->getOperand(0)->getType() ==
+                    SV2->getOperand(0)->getType() &&
+                SV1->getOperand(0)->getType() != SV1->getType() &&
+                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
+                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
+              Op1 = SV1->getOperand(0);
+              Op2 = SV2->getOperand(0);
+              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
+                                            SV1->getShuffleMask().end());
+              int LocalVF = ShuffleMask1.size();
+              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
+                LocalVF = FTy->getNumElements();
+              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
+              CombinedMask1.swap(ShuffleMask1);
+              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
+                                            SV2->getShuffleMask().end());
+              LocalVF = ShuffleMask2.size();
+              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
+                LocalVF = FTy->getNumElements();
+              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
+              CombinedMask2.swap(ShuffleMask2);
+            }
+          }
+      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
+      Builder.resizeToMatch(Op1, Op2);
+      VF = std::max(cast<VectorType>(Op1->getType())
+                        ->getElementCount()
+                        .getKnownMinValue(),
+                    cast<VectorType>(Op2->getType())
+                        ->getElementCount()
+                        .getKnownMinValue());
+      for (int I = 0, E = Mask.size(); I < E; ++I) {
+        if (CombinedMask2[I] != UndefMaskElem) {
+          assert(CombinedMask1[I] == UndefMaskElem &&
+                 "Expected undefined mask element");
+          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
+        }
+      }
+      return Builder.createShuffleVector(
+          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
+          CombinedMask1);
+    }
+    if (isa<PoisonValue>(V1))
+      return PoisonValue::get(FixedVectorType::get(
+          cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+    SmallVector<int> NewMask(Mask.begin(), Mask.end());
+    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
+    assert(V1 && "Expected non-null value after looking through shuffles.");
+
+    if (!IsIdentity)
+      return Builder.createShuffleVector(V1, NewMask);
+    return V1;
+  }
+};
+} // namespace
+
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
   ArrayRef<Value *> VL = E->Scalars;
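
Editorial aside: createShuffle is templated over the builder so the same peek-through walk can drive both IR emission and, eventually, cost estimation. Inferred from the calls above, the interface the template parameter must provide looks roughly like this; a declaration-only sketch, with ShuffleBuilderConcept as an illustrative name (the concrete implementation in this patch is the ShuffleIRBuilder used by BoUpSLP::ShuffleInstructionBuilder below):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Value.h"
using namespace llvm;

struct ShuffleBuilderConcept {
  // Widen the narrower of V1/V2 so both operands have the same vector type
  // (called right before the final two-operand shuffle is emitted).
  void resizeToMatch(Value *&V1, Value *&V2);
  // Emit (or cost-model) shufflevector V1, V2, Mask.
  Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask);
  // Emit (or cost-model) a single-operand permutation of V1.
  Value *createShuffleVector(Value *V1, ArrayRef<int> Mask);
};
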
@@ -8146,7 +8402,7 @@
 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
 /// \endcode
 /// instead.
-class BoUpSLP::ShuffleInstructionBuilder {
+class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   bool IsFinalized = false;
   /// Combined mask for all applied operands and masks. It is built during
   /// analysis and actual emission of shuffle vector instructions.
@@ -8155,7 +8411,7 @@
   /// operands, if the 3rd is going to be added, the first 2 are combined into
   /// shuffle with \p CommonMask mask, the first operand sets to be the
   /// resulting shuffle and the second operand sets to be the newly added
-  /// operand. The \p CombinedMask is transformed in the proper way after that.
+  /// operand. The \p CommonMask is transformed in the proper way after that.
   SmallVector<Value *, 2> InVectors;
   IRBuilderBase &Builder;
   BoUpSLP &R;
@@ -8199,6 +8455,29 @@
       }
       return Vec;
     }
+    /// Resizes 2 input vectors to match their sizes, if they are not equal
+    /// yet. The smaller vector is resized to the size of the larger vector.
+    void resizeToMatch(Value *&V1, Value *&V2) {
+      if (V1->getType() == V2->getType())
+        return;
+      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+      int VF = std::max(V1VF, V2VF);
+      int MinVF = std::min(V1VF, V2VF);
+      SmallVector<int> IdentityMask(VF, UndefMaskElem);
+      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
+                0);
+      Value *&Op = MinVF == V1VF ? V1 : V2;
+      Op = Builder.CreateShuffleVector(Op, IdentityMask);
+      if (auto *I = dyn_cast<Instruction>(Op)) {
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
+      if (MinVF == V1VF)
+        V1 = Op;
+      else
+        V2 = Op;
+    }
   };
 
   /// Smart shuffle instruction emission, walks through shuffles trees and
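
Editorial aside: resizeToMatch widens the narrower operand with an identity shuffle padded by undefs. A standalone sketch of the mask it builds, in plain C++ with -1 for undef and illustrative names only:

#include <numeric>
#include <vector>

// Build the mask resizeToMatch uses to widen a MinVF-wide vector to VF
// lanes: <0, 1, ..., MinVF-1, undef, ..., undef>. -1 stands for undef.
std::vector<int> makeResizeMask(int MinVF, int VF) {
  std::vector<int> Mask(VF, -1);
  std::iota(Mask.begin(), std::next(Mask.begin(), MinVF), 0);
  return Mask;
}

int main() {
  // Widening a <2 x float> to 4 lanes yields mask <0, 1, -1, -1>, i.e.
  // shufflevector %v, poison, <i32 0, i32 1, i32 undef, i32 undef>.
  std::vector<int> Mask = makeResizeMask(2, 4);
  return Mask == std::vector<int>{0, 1, -1, -1} ? 0 : 1;
}
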
@@ -8208,9 +8487,7 @@
     assert(V1 && "Expected at least one vector value.");
     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                     R.CSEBlocks);
-    if (V2)
-      return ShuffleBuilder.createShuffleVector(V1, V2, Mask);
-    return ShuffleBuilder.createShuffleVector(V1, Mask);
+    return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
   }
 
   /// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -9433,157 +9710,21 @@
     LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
   }
 
-  // Checks if the mask is an identity mask.
-  auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
-    int Limit = Mask.size();
-    return VecTy->getNumElements() == Mask.size() &&
-           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-           ShuffleVectorInst::isIdentityMask(Mask);
-  };
-  // Tries to combine 2 different masks into single one.
-  auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
-    SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
-    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
-      if (ExtMask[I] == UndefMaskElem)
-        continue;
-      NewMask[I] = Mask[ExtMask[I]];
-    }
-    Mask.swap(NewMask);
-  };
-  // Peek through shuffles, trying to simplify the final shuffle code.
-  auto &&PeekThroughShuffles =
-      [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
-                                       bool CheckForLengthChange = false) {
-        while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
-          // Exit if not a fixed vector type or changing size shuffle.
-          if (!isa<FixedVectorType>(SV->getType()) ||
-              (CheckForLengthChange && SV->changesLength()))
-            break;
-          // Exit if the identity or broadcast mask is found.
-          if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
-              SV->isZeroEltSplat())
-            break;
-          int LocalVF = Mask.size();
-          if (auto *SVOpTy =
-                  dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
-            LocalVF = SVOpTy->getNumElements();
-          bool IsOp1Undef =
-              isUndefVector(SV->getOperand(0),
-                            buildUseMask(LocalVF, Mask, UseMask::FirstArg))
-                  .all();
-          bool IsOp2Undef =
-              isUndefVector(SV->getOperand(1),
-                            buildUseMask(LocalVF, Mask, UseMask::SecondArg))
-                  .all();
-          if (!IsOp1Undef && !IsOp2Undef)
-            break;
-          SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
-                                       SV->getShuffleMask().end());
-          CombineMasks(ShuffleMask, Mask);
-          Mask.swap(ShuffleMask);
-          if (IsOp2Undef)
-            V = SV->getOperand(0);
-          else
-            V = SV->getOperand(1);
-        }
-      };
-  // Smart shuffle instruction emission, walks through shuffles trees and
-  // tries to find the best matching vector for the actual shuffle
-  // instruction.
-  auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
-                          &CombineMasks](Value *V1, Value *V2,
-                                         ArrayRef<int> Mask) -> Value * {
-    assert(V1 && "Expected at least one vector value.");
-    int VF = Mask.size();
-    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
-      VF = FTy->getNumElements();
-    if (V2 &&
-        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
-      // Peek through shuffles.
-      Value *Op1 = V1;
-      Value *Op2 = V2;
-      int VF = cast<VectorType>(V1->getType())
-                   ->getElementCount()
-                   .getKnownMinValue();
-      SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
-      SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
-      for (int I = 0, E = Mask.size(); I < E; ++I) {
-        if (Mask[I] < VF)
-          CombinedMask1[I] = Mask[I];
-        else
-          CombinedMask2[I] = Mask[I] - VF;
-      }
-      Value *PrevOp1;
-      Value *PrevOp2;
-      do {
-        PrevOp1 = Op1;
-        PrevOp2 = Op2;
-        PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
-        PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
-        // Check if we have 2 resizing shuffles - need to peek through operands
-        // again.
-        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
-          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
-            SmallBitVector UseMask1 = buildUseMask(
-                cast<FixedVectorType>(SV1->getOperand(1)->getType())
-                    ->getNumElements(),
-                CombinedMask1, UseMask::FirstArg);
-            SmallBitVector UseMask2 = buildUseMask(
-                cast<FixedVectorType>(SV2->getOperand(1)->getType())
-                    ->getNumElements(),
-                CombinedMask2, UseMask::FirstArg);
-            if (SV1->getOperand(0)->getType() ==
-                    SV2->getOperand(0)->getType() &&
-                SV1->getOperand(0)->getType() != SV1->getType() &&
-                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
-                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
-              Op1 = SV1->getOperand(0);
-              Op2 = SV2->getOperand(0);
-              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
-                                            SV1->getShuffleMask().end());
-              CombineMasks(ShuffleMask1, CombinedMask1);
-              CombinedMask1.swap(ShuffleMask1);
-              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
-                                            SV2->getShuffleMask().end());
-              CombineMasks(ShuffleMask2, CombinedMask2);
-              CombinedMask2.swap(ShuffleMask2);
-            }
-          }
-      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
-      VF = cast<VectorType>(Op1->getType())
-               ->getElementCount()
-               .getKnownMinValue();
-      for (int I = 0, E = Mask.size(); I < E; ++I) {
-        if (CombinedMask2[I] != UndefMaskElem) {
-          assert(CombinedMask1[I] == UndefMaskElem &&
-                 "Expected undefined mask element");
-          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
-        }
-      }
-      Value *Vec = Builder.CreateShuffleVector(
-          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
-          CombinedMask1);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    }
-    if (isa<PoisonValue>(V1))
-      return PoisonValue::get(FixedVectorType::get(
-          cast<FixedVectorType>(V1->getType())->getElementType(),
-          Mask.size()));
-    Value *Op = V1;
-    SmallVector<int> CombinedMask(Mask);
-    PeekThroughShuffles(Op, CombinedMask);
-    if (!isa<FixedVectorType>(Op->getType()) ||
-        !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
-      Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    }
-    return Op;
-  };
+  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
+    SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+    SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+    for (int I = 0, E = Mask.size(); I < E; ++I) {
+      if (Mask[I] < VF)
+        CombinedMask1[I] = Mask[I];
+      else
+        CombinedMask2[I] = Mask[I] - VF;
+    }
+    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+    ShuffleBuilder.add(V1, CombinedMask1);
+    if (V2)
+      ShuffleBuilder.add(V2, CombinedMask2);
+    return ShuffleBuilder.finalize(std::nullopt);
+  };
 
   auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
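
Editorial aside: the new CreateShuffle wrapper only splits the combined two-operand mask into per-operand masks and delegates all the peek-through logic to ShuffleInstructionBuilder. The split itself, sketched standalone in plain C++ with -1 for undef and hypothetical names:

#include <cassert>
#include <cstddef>
#include <vector>

constexpr int Undef = -1;

// Split a two-operand shuffle mask into per-operand masks, as the new
// CreateShuffle lambda does: lanes < VF select from the first operand,
// lanes >= VF select (rebased) from the second.
void splitMask(const std::vector<int> &Mask, int VF,
               std::vector<int> &Mask1, std::vector<int> &Mask2) {
  Mask1.assign(Mask.size(), Undef);
  Mask2.assign(Mask.size(), Undef);
  for (std::size_t I = 0; I < Mask.size(); ++I) {
    if (Mask[I] < VF)
      Mask1[I] = Mask[I];      // lane comes from the first operand
    else
      Mask2[I] = Mask[I] - VF; // lane comes from the second operand
  }
}

int main() {
  std::vector<int> M1, M2;
  splitMask({1, 0, 5, 4}, /*VF=*/4, M1, M2);
  assert(M1 == (std::vector<int>{1, 0, Undef, Undef}));
  assert(M2 == (std::vector<int>{Undef, Undef, 1, 0}));
  return 0;
}
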
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -30,11 +30,11 @@
 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32>
 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP4]], <4 x i32>
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[TMP9]], [[SHUFFLE]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP12]], <4 x float>* [[TMP13]], align 4
 ; CHECK-NEXT: ret void
 ;
 %gep0 = getelementptr inbounds float, float* %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
@@ -43,10 +43,9 @@
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT: [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32>
+; CHECK-NEXT: store <2 x float> [[TMP3]], ptr null, align 4
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32>
 ; CHECK-NEXT: store <2 x float> [[TMP8]], ptr null, align 4
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32>
-; CHECK-NEXT: store <2 x float> [[TMP9]], ptr null, align 4
 ; CHECK-NEXT: ret void
 ;
 %1 = getelementptr inbounds float, ptr undef, i32 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
@@ -26,26 +26,27 @@
 ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float>
 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]],
 ; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> , [[TMP8]]
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
-; CHECK-NEXT: store float [[TMP10]], float* @g, align 4
-; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]],
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT: store float [[TMP12]], float* @c, align 4
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT: store float [[TMP13]], float* @d, align 4
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
-; CHECK-NEXT: store float [[TMP14]], float* @e, align 4
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT: store float [[TMP15]], float* @f, align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[SHUFFLE]], <4 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = fsub <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; CHECK-NEXT: store float [[TMP11]], float* @g, align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[TMP10]],
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP12]], i32 2
+; CHECK-NEXT: store float [[TMP13]], float* @c, align 4
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0
+; CHECK-NEXT: store float [[TMP14]], float* @d, align 4
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 3
+; CHECK-NEXT: store float [[TMP15]], float* @e, align 4
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 1
+; CHECK-NEXT: store float [[TMP16]], float* @f, align 4
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP12]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = fadd <4 x float> [[TMP12]], [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <4 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = fptosi <4 x float> [[TMP22]] to <4 x i32>
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* [[TMP24]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -13,34 +13,35 @@
 ; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX12:%.*]], [[WHILE]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[SHUFFLE13:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> poison, i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP4]], <16 x i64> [[TMP5]], <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[SHUFFLE]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP7]], <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i64> [[TMP8]], <16 x i64> [[TMP9]], <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i64> [[TMP10]], i64 [[TMP0]], i32 9
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i64> [[TMP11]], i64 [[TMP0]], i32 10
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i64> [[TMP12]], i64 [[TMP0]], i32 11
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[SHUFFLE13]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> [[TMP14]], <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[A3]], align 16
-; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP15]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 [[TMP19]], [[TMP2]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i64 [[TMP2]], [[TMP2]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[SHUFFLE13]], i32 3
-; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i64 [[TMP2]], [[TMP20]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i64 [[TMP20]], [[TMP16]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = xor i64 [[TMP16]], [[TMP16]]
-; CHECK-NEXT: [[OP_RDX5:%.*]] = xor i64 [[TMP17]], [[TMP17]]
-; CHECK-NEXT: [[OP_RDX6:%.*]] = xor i64 [[TMP18]], [[TMP18]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i64> poison, i64 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i64> [[TMP9]], <16 x i64> [[TMP10]], <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i64> [[TMP11]], <16 x i64> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i64> [[TMP13]], i64 [[TMP0]], i32 9
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i64> [[TMP14]], i64 [[TMP0]], i32 10
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i64> [[TMP15]], i64 [[TMP0]], i32 11
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i64> [[TMP16]], <16 x i64> [[TMP17]], <16 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[A3]], align 16
+; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP18]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 [[TMP22]], [[TMP3]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = xor i64 [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[OP_RDX2:%.*]] = xor i64 [[TMP3]], [[TMP23]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = xor i64 [[TMP23]], [[TMP19]]
+; CHECK-NEXT: [[OP_RDX4:%.*]] = xor i64 [[TMP19]], [[TMP19]]
+; CHECK-NEXT: [[OP_RDX5:%.*]] = xor i64 [[TMP20]], [[TMP20]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = xor i64 [[TMP21]], [[TMP21]]
 ; CHECK-NEXT: [[OP_RDX7:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX1]]
 ; CHECK-NEXT: [[OP_RDX8:%.*]] = xor i64 [[OP_RDX2]], [[OP_RDX3]]
 ; CHECK-NEXT: [[OP_RDX9:%.*]] = xor i64 [[OP_RDX4]], [[OP_RDX5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -4,16 +4,17 @@
 define void @test(float* noalias %0, float* %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float*> poison, float* [[P:%.*]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP3]], i32 4, <8 x i1> , <8 x float> poison)
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> , <16 x float> [[SHUFFLE1]], <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[SHUFFLE1]], [[TMP6]]
-; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> poison, <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP4]] to <16 x float>*
-; CHECK-NEXT: store <16 x float> [[SHUFFLE2]], <16 x float>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[TMP3]], <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP8]], <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP5]] to <16 x float>*
+; CHECK-NEXT: store <16 x float> [[TMP11]], <16 x float>* [[TMP12]], align 4
 ; CHECK-NEXT: ret void
 ;
 %2 = getelementptr inbounds float, float* %p, i64 2