diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -164,14 +164,13 @@
     "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
     cl::desc("The maximum look-ahead depth for operand reordering scores"));

-// The maximum depth that the look-ahead score heuristic will explore
-// when it probing among candidates for vectorization tree roots.
-// The higher this value, the higher the compilation time overhead but unlike
-// similar limit for operands ordering this is less frequently used, hence
-// impact of higher value is less noticeable.
-static cl::opt<int> RootLookAheadMaxDepth(
-    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
-    cl::desc("The maximum look-ahead depth for searching best rooting option"));
+// The maximum tree size to use when probing among candidates for
+// vectorization tree roots. The higher this value, the higher the compilation
+// time overhead, but unlike the similar limit for operand ordering this is
+// used less frequently, so the impact of a higher value is less noticeable.
+static cl::opt<int> RootLookAheadMaxSize(
+    "slp-root-look-ahead-max-size", cl::init(5), cl::Hidden,
+    cl::desc("The maximum tree size for searching best rooting option"));

 static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden,
@@ -839,6 +838,15 @@ class BoUpSLP {
   struct TreeEntry;
   struct ScheduleData;

+  /// Limit the size of the SLP tree to this many nodes.
+  Optional<int> MaxTreeSize;
+  /// Disables scheduling. This is to limit the compilation time used by
+  /// buildTree(), usually in combination with \p MaxTreeSize for quickly
+  /// building approximate trees that can be used for estimating which roots
+  /// are to be preferred.
+  /// WARNING: If this is enabled, the tree is not guaranteed to contain valid
+  /// instruction bundles that can actually get codegened.
+  bool ForceDisableScheduling = false;
+
 public:
   using ValueList = SmallVector<Value *, 8>;
@@ -897,6 +905,15 @@
   void buildTree(ArrayRef<Value *> Roots,
                  ArrayRef<Value *> UserIgnoreLst = None);

+  /// Builds a tree starting from \p Roots with up to \p MaxSize nodes. For
+  /// faster compilation, scheduling is disabled by default; it can be
+  /// re-enabled with \p DisableScheduling = false.
+  /// This function is to be used for a look-ahead style evaluation of root
+  /// nodes, estimating which ones are worth building a full tree for.
+  InstructionCost buildTreeFastAndGetCost(ArrayRef<Value *> Roots,
+                                          Optional<int> MaxSize = 5,
+                                          bool DisableScheduling = true);
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -906,6 +923,8 @@
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
+    ForceDisableScheduling = false;
+    MaxTreeSize = None;
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
     MustGather.clear();
@@ -2018,17 +2037,16 @@
   /// above the LookAheadHeuristics::ScoreFail.
   Optional<int>
   findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates) {
-    LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,
-                                  RootLookAheadMaxDepth);
-    int BestScore = LookAheadHeuristics::ScoreFail;
+    InstructionCost BestCost = InstructionCost::getMax();
     Optional<int> Index = None;
+
     for (int I : seq<int>(0, Candidates.size())) {
-      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
-                                               Candidates[I].second,
-                                               /*U1=*/nullptr, /*U2=*/nullptr,
-                                               /*Level=*/1, None);
-      if (Score > BestScore) {
-        BestScore = Score;
+      SmallVector<Value *> Roots(
+          {Candidates[I].first, Candidates[I].second});
+      InstructionCost Cost =
+          buildTreeFastAndGetCost(Roots, RootLookAheadMaxSize.getValue());
+      if (Cost < BestCost) {
+        BestCost = Cost;
         Index = I;
       }
     }
@@ -2527,22 +2545,24 @@
        assert(!getTreeEntry(V) && "Scalar already in tree!");
        ScalarToTreeEntry[V] = Last;
      }
-      // Update the scheduler bundle to point to this TreeEntry.
-      ScheduleData *BundleMember = Bundle.getValue();
-      assert((BundleMember || isa<PHINode>(S.MainOp) ||
-              isVectorLikeInstWithConstOps(S.MainOp) ||
-              doesNotNeedToSchedule(VL)) &&
-             "Bundle and VL out of sync");
-      if (BundleMember) {
-        for (Value *V : VL) {
-          if (doesNotNeedToBeScheduled(V))
-            continue;
-          assert(BundleMember && "Unexpected end of bundle.");
-          BundleMember->TE = Last;
-          BundleMember = BundleMember->NextInBundle;
+      if (!ForceDisableScheduling) {
+        // Update the scheduler bundle to point to this TreeEntry.
+        ScheduleData *BundleMember = Bundle.getValue();
+        assert((BundleMember || isa<PHINode>(S.MainOp) ||
+                isVectorLikeInstWithConstOps(S.MainOp) ||
+                doesNotNeedToSchedule(VL)) &&
+               "Bundle and VL out of sync");
+        if (BundleMember) {
+          for (Value *V : VL) {
+            if (doesNotNeedToBeScheduled(V))
+              continue;
+            assert(BundleMember && "Unexpected end of bundle.");
+            BundleMember->TE = Last;
+            BundleMember = BundleMember->NextInBundle;
+          }
         }
+        assert(!BundleMember && "Bundle and VL out of sync");
       }
-      assert(!BundleMember && "Bundle and VL out of sync");
     } else {
       MustGather.insert(VL.begin(), VL.end());
     }
@@ -2920,6 +2940,7 @@
     /// instructions into the ready-list.
     template <typename ReadyListType>
     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+      assert(!DisableScheduling && "Trying to schedule when disabled");
       SD->IsScheduled = true;
       LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
@@ -3079,6 +3100,14 @@
     tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                       const InstructionsState &S);

+    /// \Returns an uninitialized ScheduleData bundle. This is needed when
+    /// the scheduler is disabled, because a non-null bundle is used to
+    /// determine whether a TreeEntry is marked as vectorizable.
+    static ScheduleData *getDummyBundle() {
+      static ScheduleData SD;
+      return &SD;
+    }
+
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -3157,6 +3186,9 @@
     /// Make sure that the initial SchedulingRegionID is greater than the
     /// initial SchedulingRegionID in ScheduleData (which is 0).
     int SchedulingRegionID = 1;
+
+    /// This is set to true if we need to skip scheduling for this block.
+    bool DisableScheduling = false;
   };

   /// Attaches the BlockScheduling structures to basic blocks.
@@ -4087,6 +4119,26 @@
   buildTree_rec(Roots, 0, EdgeInfo());
 }

+InstructionCost BoUpSLP::buildTreeFastAndGetCost(ArrayRef<Value *> Roots,
+                                                 Optional<int> MaxSize,
+                                                 bool DisableScheduling) {
+  deleteTree();
+  MaxTreeSize = MaxSize;
+  ForceDisableScheduling = DisableScheduling;
+  if (!allSameType(Roots))
+    return InstructionCost::getMax();
+
+  buildTree_rec(Roots, 0, EdgeInfo());
+
+  if (isTreeTinyAndNotFullyVectorizable())
+    return InstructionCost::getMax();
+  reorderTopToBottom();
+  reorderBottomToTop(!isa<InsertElementInst>(Roots.front()));
+  buildExternalUses();
+  computeMinimumValueSizes();
+  return getTreeCost();
+}
+
 namespace {
 /// Tracks the state we can represent the loads in the given sequence.
 enum class LoadsState { Gather, Vectorize, ScatterVectorize };
@@ -4249,6 +4301,13 @@
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
+  // If we are building a fast approximate tree, return early once we reach
+  // the tree size limit.
+  if (MaxTreeSize && VectorizableTree.size() >= *MaxTreeSize) {
+    LLVM_DEBUG(dbgs() << "SLP: Reached max tree size " << *MaxTreeSize
+                      << ".\n");
+    return;
+  }
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

   SmallVector<int> ReuseShuffleIndicies;
@@ -4418,13 +4477,16 @@
     BSRef = std::make_unique<BlockScheduling>(BB);

   BlockScheduling &BS = *BSRef;
+  BS.DisableScheduling = ForceDisableScheduling;

-  Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+  Optional<ScheduleData *> Bundle = !ForceDisableScheduling
+                                        ? BS.tryScheduleBundle(VL, this, S)
+                                        : BlockScheduling::getDummyBundle();
 #ifdef EXPENSIVE_CHECKS
   // Make sure we didn't break any internal invariants
   BS.verify();
 #endif
-  if (!Bundle) {
+  if (!ForceDisableScheduling && !Bundle) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
@@ -4433,7 +4495,9 @@
                  ReuseShuffleIndicies);
     return;
   }
-  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+  LLVM_DEBUG(dbgs() << (ForceDisableScheduling
+                            ? "SLP: Scheduling was disabled.\n"
+                            : "SLP: We are able to schedule this bundle.\n"));

   unsigned ShuffleOrOp = S.isAltShuffle() ?
                  (unsigned) Instruction::ShuffleVector : S.getOpcode();
@@ -8075,6 +8139,7 @@
 Optional<BoUpSLP::ScheduleData *>
 BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                             const InstructionsState &S) {
+  assert(!DisableScheduling && "Trying to schedule when disabled");
   // No need to schedule PHIs, insertelement, extractelement and extractvalue
   // instructions.
   if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
@@ -8171,6 +8236,9 @@

 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                 Value *OpValue) {
+  // Early return if scheduling is disabled.
+  if (DisableScheduling)
+    return;
   if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
       doesNotNeedToSchedule(VL))
     return;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll
@@ -157,6 +157,8 @@
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -192,6 +192,8 @@
 ; MINTREESIZE-NEXT:    [[TMP9:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
 ; MINTREESIZE-NEXT:    [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; MINTREESIZE-NEXT:    [[TMP11:%.*]] = insertelement <2 x float> poison, float [[Q6]], i32 0
+; MINTREESIZE-NEXT:    [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[Q5]], i32 1
 ; MINTREESIZE-NEXT:    [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
 ; MINTREESIZE-NEXT:    call void @llvm.assume(i1 [[QI]])
 ; MINTREESIZE-NEXT:    ret <4 x float> undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll
@@ -76,11 +76,11 @@
 ; AVX-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]])
 ; AVX-NEXT:    [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1
 ; AVX-NEXT:    [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2
-; AVX-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2
+; AVX-NEXT:    [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3
+; AVX-NEXT:    [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3
+; AVX-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2
+; AVX-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2
 ; AVX-NEXT:    [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2
-; AVX-NEXT:    [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2
 ; AVX-NEXT:    [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4
 ; AVX-NEXT:    [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4
 ; AVX-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2
@@ -88,36 +88,36 @@
 ; AVX-NEXT:    [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]])
 ; AVX-NEXT:    [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5
 ; AVX-NEXT:    [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5
-; AVX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2
-; AVX-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2
+; AVX-NEXT:    [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7
+; AVX-NEXT:    [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7
+; AVX-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2
+; AVX-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2
 ; AVX-NEXT:    [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]])
-; AVX-NEXT:    [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6
-; AVX-NEXT:    [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6
-; AVX-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_2]], align 2
-; AVX-NEXT:    [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_2]], align 2
+; AVX-NEXT:    [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64
+; AVX-NEXT:    [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48
+; AVX-NEXT:    [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_1]], align 2
+; AVX-NEXT:    [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_1]], align 2
 ; AVX-NEXT:    [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP12]], <2 x i16> [[TMP13]])
 ; AVX-NEXT:    [[TMP15:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64>
-; AVX-NEXT:    [[TMP16:%.*]] = shl nuw <2 x i64> [[TMP15]],
-; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0
-; AVX-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1
-; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[TMP18]], [[TMP17]]
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16
-; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
+; AVX-NEXT:    [[TMP16:%.*]] = shl nuw nsw <2 x i64> [[TMP15]],
+; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1
+; AVX-NEXT:    [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[TMP17]]
+; AVX-NEXT:    [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0
+; AVX-NEXT:    [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[TMP18]]
 ; AVX-NEXT:    [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64
 ; AVX-NEXT:    [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]]
 ; AVX-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
-; AVX-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_6]], align 2
-; AVX-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_6]], align 2
+; AVX-NEXT:    [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64
+; AVX-NEXT:    [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48
+; AVX-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_5]], align 2
+; AVX-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_5]], align 2
 ; AVX-NEXT:    [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP19]], <2 x i16> [[TMP20]])
 ; AVX-NEXT:    [[TMP22:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64>
-; AVX-NEXT:    [[TMP23:%.*]] = shl nuw <2 x i64> [[TMP22]],
-; AVX-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
-; AVX-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
-; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[TMP25]], [[TMP24]]
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16
-; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]]
+; AVX-NEXT:    [[TMP23:%.*]] = shl nuw nsw <2 x i64> [[TMP22]],
+; AVX-NEXT:    [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1
+; AVX-NEXT:    [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[TMP24]]
+; AVX-NEXT:    [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0
+; AVX-NEXT:    [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[TMP25]]
 ; AVX-NEXT:    [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64
 ; AVX-NEXT:    [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]]
 ; AVX-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-pair-path.ll
@@ -61,18 +61,19 @@
 define void @root_steering() {
 ; CHECK-LABEL: @root_steering(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[CHAIN2_2:%.*]] = fadd double 4.000000e-01, 5.000000e-01
-; CHECK-NEXT:    [[CHAIN2_1:%.*]] = fmul double 3.000000e-01, [[CHAIN2_2]]
-; CHECK-NEXT:    [[ROOT5:%.*]] = fadd double 2.000000e-01, [[CHAIN2_1]]
 ; CHECK-NEXT:    [[ROOT3:%.*]] = fmul double 3.000000e-01, 2.000000e-01
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[ROOT3]], 1.000000e-01
 ; CHECK-NEXT:    [[CHAINB_3:%.*]] = fadd double 3.000000e-01, 4.000000e-01
 ; CHECK-NEXT:    [[CHAINB_2:%.*]] = fmul double 2.000000e-01, [[CHAINB_3]]
-; CHECK-NEXT:    [[CHAINB_1:%.*]] = fadd double 1.000000e-01, [[CHAINB_2]]
-; CHECK-NEXT:    [[ROOT4:%.*]] = fmul double [[MUL]], [[CHAINB_1]]
-; CHECK-NEXT:    [[ROOT2:%.*]] = fadd double 1.000000e-01, [[ROOT4]]
-; CHECK-NEXT:    [[ROOT1:%.*]] = fmul double [[ROOT3]], [[ROOT5]]
-; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[ROOT1]], [[ROOT2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> , double [[CHAINB_2]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x double> , [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> , double [[MUL]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> , [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[ROOT1:%.*]] = fmul double [[ROOT3]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[ROOT1]], [[TMP6]]
 ; CHECK-NEXT:    [[SEED:%.*]] = fcmp ogt double [[DIV]], 3.000000e-01
 ; CHECK-NEXT:    ret void
 ;
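
For local experimentation, a minimal sketch of how the new knob could be exercised with opt. The file and function below are hypothetical and not part of this patch; only the -slp-root-look-ahead-max-size flag (default 5) is introduced here, and whether this particular input actually vectorizes depends on the target cost model.

; RUN: opt -passes=slp-vectorizer -slp-root-look-ahead-max-size=5 -mtriple=x86_64-unknown-linux -S %s

define double @root_probe(double %a, double %b, double %c, double %d) {
  ; Two parallel fadd/fmul chains feeding a scalar fdiv. When SLP probes
  ; candidate root pairs, it now builds a small throw-away tree (capped at
  ; the size given by the flag, with scheduling disabled) per candidate and
  ; keeps the rooting with the lowest estimated cost.
  %add0 = fadd double %a, %b
  %add1 = fadd double %c, %d
  %mul0 = fmul double %add0, %a
  %mul1 = fmul double %add1, %c
  %div = fdiv double %mul0, %mul1
  ret double %div
}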