diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6290,6 +6290,122 @@ return false; } +/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in the +/// buildvector sequence: true if \p IE1 is reached first on the operand-0 chain of \p IE2, false if \p IE2 is reached first on the chain of \p IE1. +static bool isFirstInsertElement(const InsertElementInst *IE1, + const InsertElementInst *IE2) { + const auto *I1 = IE1; + const auto *I2 = IE2; + do { + if (I2 == IE1) + return true; + if (I1 == IE2) + return false; + if (I1) + I1 = dyn_cast<InsertElementInst>(I1->getOperand(0)); + if (I2) + I2 = dyn_cast<InsertElementInst>(I2->getOperand(0)); + } while (I1 || I2); + llvm_unreachable("Two different buildvectors not expected."); +} + +/// Does the analysis of the provided shuffle masks and performs the requested +/// actions on the vectors with the given shuffle masks. It tries to do it in +/// several steps. +/// 1. If the Base vector is not an undef vector, resize the very first mask to +/// have a common VF and perform the action for 2 input vectors (including the non-undef +/// Base). Other shuffle masks are combined with the result of the 1st stage +/// and processed as a shuffle of 2 vectors. +/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform the +/// action only for 1 vector with the given mask, if it is not the identity +/// mask. +/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2 +/// vectors, combining the masks properly between the steps. 
+template <typename T> +static T *performExtractsShuffleAction( + MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base, + function_ref<unsigned(T *)> GetVF, + function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction, + function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) { + assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts."); + SmallVector<int> Mask(ShuffleMask.begin()->second); + auto VMIt = std::next(ShuffleMask.begin()); + T *Prev = nullptr; + bool IsBaseNotUndef = !isUndefVector(Base); + if (IsBaseNotUndef) { + // Base is not undef, need to combine it with the next subvectors. The Base is passed to Action as a null first input. + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { + if (Mask[Idx] == UndefMaskElem) + Mask[Idx] = Idx; + else + Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; + } + Prev = Action(Mask, {nullptr, Res.first}); + } else if (ShuffleMask.size() == 1) { + // Base is undef and only 1 vector is shuffled - perform the action only for + // single vector, if the mask is not the identity mask. + std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask); + if (Res.second) + // Identity mask is found: ResizeAction returned true, so reuse its result and skip Action. + Prev = Res.first; + else + Prev = Action(Mask, {ShuffleMask.begin()->first}); + } else { + // Base is undef and at least 2 input vectors shuffled - perform 2 vectors + // shuffles step by step, combining shuffle between the steps. + unsigned Vec1VF = GetVF(ShuffleMask.begin()->first); + unsigned Vec2VF = GetVF(VMIt->first); + if (Vec1VF == Vec2VF) { + // No need to resize the input vectors since they are of the same size, we + // can shuffle them directly. + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (SecMask[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + Mask[I] = SecMask[I] + Vec1VF; + } + } + Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first}); + } else { + // Vectors of different sizes - resize and reshuffle. 
+ std::pair<T *, bool> Res1 = + ResizeAction(ShuffleMask.begin()->first, Mask); + std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second); + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (Mask[I] != UndefMaskElem) { + assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars."); + if (Res1.second) + Mask[I] = I; + } else if (SecMask[I] != UndefMaskElem) { + assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars."); + Mask[I] = (Res2.second ? I : SecMask[I]) + VF; + } + } + Prev = Action(Mask, {Res1.first, Res2.first}); + } + VMIt = std::next(VMIt); + } + // Perform requested actions for the remaining masks/vectors. + for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { + // Shuffle other input vectors, if any. + std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second); + ArrayRef<int> SecMask = VMIt->second; + for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { + if (SecMask[I] != UndefMaskElem) { + assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) && + "Multiple uses of scalars."); + Mask[I] = (Res.second ? I : SecMask[I]) + VF; + } else if (Mask[I] != UndefMaskElem) { + Mask[I] = I; + } + } + Prev = Action(Mask, {Prev, Res.first}); + } + return Prev; +} + InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " @@ -6310,9 +6426,8 @@ SmallPtrSet ExtractCostCalculated; InstructionCost ExtractCost = 0; - SmallVector VF; - SmallVector> ShuffleMask; - SmallVector FirstUsers; + SmallVector>> ShuffleMasks; + SmallVector> FirstUsers; SmallVector DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. 
@@ -6341,14 +6456,16 @@ if (auto *FTy = dyn_cast(VU->getType())) { Optional InsertIdx = getInsertIndex(VU); if (InsertIdx) { - auto *It = find_if(FirstUsers, [VU](Value *V) { - return areTwoInsertFromSameBuildVector(VU, - cast(V)); - }); + const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar); + auto *It = + find_if(FirstUsers, + [VU](const std::pair &Pair) { + return areTwoInsertFromSameBuildVector( + VU, cast(Pair.first)); + }); int VecId = -1; if (It == FirstUsers.end()) { - VF.push_back(FTy->getNumElements()); - ShuffleMask.emplace_back(VF.back(), UndefMaskElem); + (void)ShuffleMasks.emplace_back(); // Find the insertvector, vectorized in tree, if any. Value *Base = VU; while (auto *IEBase = dyn_cast(Base)) { @@ -6357,21 +6474,31 @@ VU = IEBase; do { int Idx = E->findLaneForValue(Base); - ShuffleMask.back()[Idx] = Idx; + SmallVectorImpl &Mask = ShuffleMasks.back()[ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + Mask[Idx] = Idx; Base = cast(Base)->getOperand(0); } while (E == getTreeEntry(Base)); break; } Base = cast(Base)->getOperand(0); } - FirstUsers.push_back(VU); - DemandedElts.push_back(APInt::getZero(VF.back())); + FirstUsers.emplace_back(VU, ScalarTE); + DemandedElts.push_back(APInt::getZero(FTy->getNumElements())); VecId = FirstUsers.size() - 1; } else { + if (isFirstInsertElement(VU, cast(It->first))) + It->first = VU; VecId = std::distance(FirstUsers.begin(), It); } int InIdx = *InsertIdx; - ShuffleMask[VecId][InIdx] = EU.Lane; + SmallVectorImpl &Mask = ShuffleMasks[VecId][ScalarTE]; + if (Mask.empty()) + Mask.assign(FTy->getNumElements(), UndefMaskElem); + assert(Mask[InIdx] == UndefMaskElem && + "InsertElementInstruction used already."); + Mask[InIdx] = EU.Lane; DemandedElts[VecId].setBit(InIdx); continue; } @@ -6398,89 +6525,75 @@ InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; - if (FirstUsers.size() == 1) { - int Limit = ShuffleMask.front().size() * 2; - if 
(!all_of(ShuffleMask.front(), - [Limit](int Idx) { return Idx < Limit; }) || - !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) { - InstructionCost C = TTI->getShuffleCost( + auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef Mask) { + InstructionCost C = 0; + unsigned VF = Mask.size(); + unsigned VecVF = TE->getVectorFactor(); + if (VF != VecVF && + (any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }) || + (all_of(Mask, + [VF](int Idx) { return Idx < 2 * static_cast(VF); }) && + !ShuffleVectorInst::isIdentityMask(Mask)))) { + SmallVector OrigMask(VecVF, UndefMaskElem); + std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, - cast(FirstUsers.front()->getType()), - ShuffleMask.front()); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); + FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); Cost += C; + return std::make_pair(TE, true); } + return std::make_pair(TE, false); + }; + // Calculate the cost of the reshuffled vectors, if any. 
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) { + Value *Base = cast(FirstUsers[I].first)->getOperand(0); + unsigned VF = ShuffleMasks[I].begin()->second.size(); + auto *FTy = FixedVectorType::get( + cast(FirstUsers[I].first->getType())->getElementType(), VF); + auto Vector = ShuffleMasks[I].takeVector(); + auto &&EstimateShufflesCost = [this, FTy, + &Cost](ArrayRef Mask, + ArrayRef TEs) { + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected exactly 1 or 2 tree entries."); + if (TEs.size() == 1) { + int Limit = 2 * Mask.size(); + if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) || + !ShuffleVectorInst::isIdentityMask(Mask)) { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement " + "external users.\n"; + TEs.front()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + } else { + InstructionCost C = + TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of vector node and external " + "insertelement users.\n"; + if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); + dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + } + return TEs.back(); + }; + (void)performExtractsShuffleAction( + makeMutableArrayRef(Vector.data(), Vector.size()), Base, + [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF, + EstimateShufflesCost); InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast(FirstUsers.front()->getType()), - DemandedElts.front(), /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } else if (FirstUsers.size() >= 2) { - unsigned MaxVF = *std::max_element(VF.begin(), VF.end()); - // Combined masks of the 
first 2 vectors. - SmallVector CombinedMask(MaxVF, UndefMaskElem); - copy(ShuffleMask.front(), CombinedMask.begin()); - APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF); - auto *VecTy = FixedVectorType::get( - cast(FirstUsers.front()->getType())->getElementType(), - MaxVF); - for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) { - if (ShuffleMask[1][I] != UndefMaskElem) { - CombinedMask[I] = ShuffleMask[1][I] + MaxVF; - CombinedDemandedElts.setBit(I); - } - } - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); + cast(FirstUsers[I].first->getType()), DemandedElts[I], + /*Insert*/ true, /*Extract*/ false); Cost -= InsertCost; - for (int I = 2, E = FirstUsers.size(); I < E; ++I) { - if (ShuffleMask[I].empty()) - continue; - // Other elements - permutation of 2 vectors (the initial one and the - // next Ith incoming vector). 
- unsigned VF = ShuffleMask[I].size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) { - int Mask = ShuffleMask[I][Idx]; - if (Mask != UndefMaskElem) - CombinedMask[Idx] = MaxVF + Mask; - else if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - } - for (unsigned Idx = VF; Idx < MaxVF; ++Idx) - if (CombinedMask[Idx] != UndefMaskElem) - CombinedMask[Idx] = Idx; - InstructionCost C = - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask); - LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for final shuffle of vector node and external " - "insertelement users " - << *VectorizableTree.front()->Scalars.front() << ".\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - InstructionCost InsertCost = TTI->getScalarizationOverhead( - cast(FirstUsers[I]->getType()), DemandedElts[I], - /*Insert*/ true, /*Extract*/ false); - LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost - << " for insertelements gather.\n" - << "SLP: Current total cost = " << Cost << "\n"); - Cost -= InsertCost; - } } #ifndef NDEBUG diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling-inseltpoison.ll @@ -11,25 +11,27 @@ ; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04 ; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x 
double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[T_0259]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 ; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[P3_ADDR_0258]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i32 0 ; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]] -; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]] +; CHECK-NEXT: [[TMP7]] = fadd <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 
[[INDVARS_IV_NEXT267]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll @@ -11,25 +11,27 @@ ; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04 ; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 -; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> , double [[ADD]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[T_0259]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 ; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]]) ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 
0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[P3_ADDR_0258]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[TMP6]], i32 0 ; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]]) ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] ; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]] -; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]] -; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]] +; CHECK-NEXT: [[TMP7]] = fadd <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-with-undefs.ll @@ -6,28 +6,26 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double 
[[TMP5]], [[TMP4]] +; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[BODY]] ] +; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00 +; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00 +; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]] ; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00 ; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]] ; CHECK: if.then135.i: -; CHECK-NEXT: [[TMP6:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i1> , i1 [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x double> zeroinitializer, <2 x double> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[CMP145_I:%.*]] = fcmp fast olt double [[PHI1]], 0.000000e+00 +; CHECK-NEXT: [[CMP152_I:%.*]] = fcmp fast olt double [[PHI2]], 0.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i1> , i1 [[CMP152_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[TMP0]], <2 x double> zeroinitializer, <2 x double> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], zeroinitializer ; CHECK-NEXT: br label [[IF_END209_I]] ; CHECK: if.end209.i: -; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ 
[[TMP12]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double> [ [[TMP4]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ] ; CHECK-NEXT: ret void ; entry: