diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14321,78 +14321,42 @@ template static bool tryToVectorizeSequence( - SmallVectorImpl &Incoming, function_ref Comparator, + SmallVectorImpl &UnsortedSeeds, + function_ref Comparator, function_ref AreCompatible, - function_ref, bool)> TryToVectorizeHelper, - bool MaxVFOnly, BoUpSLP &R) { + function_ref, bool)> TryToVectorizeHelper, BoUpSLP &R) { + // We separate the seeds into groups based on their types. + MapVector> SeedsMap; + for (T *Seed : UnsortedSeeds) + SeedsMap[Seed->getType()].push_back(Seed); + // Sort the seeds that correspond to each type. + for (auto &Pair : SeedsMap) + stable_sort(Pair.second, Comparator); + // For all types try to vectorize the seed vector. bool Changed = false; - // Sort by type, parent, operands. - stable_sort(Incoming, Comparator); - - // Try to vectorize elements base on their type. - SmallVector Candidates; - for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) { - // Look for the next elements with the same type, parent and operand - // kinds. - auto *SameTypeIt = IncIt; - while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt)) - ++SameTypeIt; - - // Try to vectorize them. - unsigned NumElts = (SameTypeIt - IncIt); - LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" - << NumElts << ")\n"); - // The vectorization is a 3-state attempt: - // 1. Try to vectorize instructions with the same/alternate opcodes with the - // size of maximal register at first. - // 2. Try to vectorize remaining instructions with the same type, if - // possible. This may result in the better vectorization results rather than - // if we try just to vectorize instructions with the same/alternate opcodes. - // 3. Final attempt to try to vectorize all instructions with the - // same/alternate ops only, this may result in some extra final - // vectorization. - if (NumElts > 1 && - TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) { - // Success start over because instructions might have been changed. + for (auto &Pair : SeedsMap) { + auto &Seeds = Pair.second; + if (Seeds.empty()) + continue; + // Try the full Seeds vector first. + if (TryToVectorizeHelper(Seeds, /*MaxVFOnly=*/false)) { Changed = true; - } else { - /// \Returns the minimum number of elements that we will attempt to - /// vectorize. - auto GetMinNumElements = [&R](Value *V) { - unsigned EltSize = R.getVectorElementSize(V); - return std::max(2U, R.getMaxVecRegSize() / EltSize); - }; - if (NumElts < GetMinNumElements(*IncIt) && - (Candidates.empty() || - Candidates.front()->getType() == (*IncIt)->getType())) { - Candidates.append(IncIt, std::next(IncIt, NumElts)); - } + continue; } - // Final attempt to vectorize instructions with the same types. - if (Candidates.size() > 1 && - (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { - if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) { - // Success start over because instructions might have been changed. + // If the full vector failed, slice the Seeds according to `AreCompatible()` + uint32_t SliceSz; + for (auto SliceBegin = Seeds.begin(), ItE = Seeds.end(); SliceBegin != ItE; + SliceBegin += SliceSz) { + auto SliceEnd = std::next(SliceBegin); + while (SliceEnd != ItE && AreCompatible(*SliceEnd, *std::prev(SliceEnd))) + ++SliceEnd; + SliceSz = SliceEnd - SliceBegin; + if (SliceSz <= 1) + continue; + ArrayRef SeedsSlice(SliceBegin, SliceEnd); + if (TryToVectorizeHelper(SeedsSlice, /*MaxVFOnly=*/false)) Changed = true; - } else if (MaxVFOnly) { - // Try to vectorize using small vectors. - for (auto *It = Candidates.begin(), *End = Candidates.end(); - It != End;) { - auto *SameTypeIt = It; - while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It)) - ++SameTypeIt; - unsigned NumElts = (SameTypeIt - It); - if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts), - /*MaxVFOnly=*/false)) - Changed = true; - It = SameTypeIt; - } - } - Candidates.clear(); } - - // Start over at the next instruction of a different type (or the end). - IncIt = SameTypeIt; } return Changed; } @@ -14498,7 +14462,7 @@ return false; return tryToVectorizeList(Candidates, R, MaxVFOnly); }, - /*MaxVFOnly=*/true, R); + R); return Changed; } @@ -14671,7 +14635,7 @@ [this, &R](ArrayRef Candidates, bool MaxVFOnly) { return tryToVectorizeList(Candidates, R, MaxVFOnly); }, - /*MaxVFOnly=*/true, R); + R); Changed |= HaveVectorizedPhiNodes; VisitedInstrs.insert(Incoming.begin(), Incoming.end()); } while (HaveVectorizedPhiNodes); @@ -14984,7 +14948,7 @@ [this, &R](ArrayRef Candidates, bool) { return vectorizeStores(Candidates, R); }, - /*MaxVFOnly=*/false, R); + R); } return Changed; } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -3,27 +3,22 @@ define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; CHECK-NEXT: br label [[TMP17:%.*]] -; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ] -; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ] -; CHECK-NEXT: [[TMP22]] = or i32 [[TMP18]], 0 -; CHECK-NEXT: br label [[TMP17]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i64> [[TMP5]] to <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2:%.*]], <2 x i64> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> , <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = or <2 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32> +; CHECK-NEXT: br label [[TMP11:%.*]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP17:%.*]], [[TMP11]] ], [ [[TMP6]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ zeroinitializer, [[TMP11]] ], [ [[TMP10]], [[TMP3]] ] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = or <2 x i32> zeroinitializer, [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i32> zeroinitializer, [[TMP14]] +; CHECK-NEXT: [[TMP17]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], <2 x i32> +; CHECK-NEXT: br label [[TMP11]] ; %4 = extractelement <2 x i64> %1, i64 0 %5 = or i64 %4, 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,7 +10,7 @@ ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP7:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] @@ -32,19 +32,18 @@ ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, [[BB12]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[BB10]] ], [ [[TMP10:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] +; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -7,46 +7,42 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[P0:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[A2:%.*]] = add i64 [[P2:%.*]], [[P2]] -; CHECK-NEXT: [[A3:%.*]] = add i64 [[P3:%.*]], [[P3]] -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[M2:%.*]] = mul i64 [[P2]], [[P2]] -; CHECK-NEXT: [[M3:%.*]] = mul i64 [[P3]], [[P3]] -; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i64> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]] -; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[S0:%.*]] = sub i64 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[S1:%.*]] = sub i64 [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[S2:%.*]] = sub i64 [[M2]], [[D2]] -; CHECK-NEXT: [[S3:%.*]] = sub i64 [[M3]], [[D3]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[SHL1:%.*]] = shl i64 [[TMP9]], [[S0]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[TMP10]], [[S1]] -; CHECK-NEXT: [[SHL3:%.*]] = shl i64 [[A2]], [[S2]] -; CHECK-NEXT: [[SHL4:%.*]] = shl i64 [[A3]], [[S3]] -; CHECK-NEXT: [[O0:%.*]] = or i64 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[P2:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[P3:%.*]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = mul <2 x i64> [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[D0:%.*]] = sdiv i64 [[P0]], [[P0]] +; CHECK-NEXT: [[D1:%.*]] = sdiv i64 [[P1]], [[P1]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[S0:%.*]] = sub i64 [[TMP8]], [[D0]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[S1:%.*]] = sub i64 [[TMP9]], [[D1]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; CHECK-NEXT: [[SHL1:%.*]] = shl i64 [[TMP10]], [[S0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[TMP11]], [[S1]] +; CHECK-NEXT: [[O0:%.*]] = or i64 [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[TT0:%.*]] = trunc i64 [[O0]] to i32 -; CHECK-NEXT: [[O1:%.*]] = or i64 [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[O1:%.*]] = or i64 [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TT1:%.*]] = trunc i64 [[O1]] to i32 -; CHECK-NEXT: [[O2:%.*]] = or i64 [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TT2:%.*]] = trunc i64 [[O2]] to i32 -; CHECK-NEXT: [[O3:%.*]] = or i64 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TT3:%.*]] = trunc i64 [[O3]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = sdiv <2 x i64> [[TMP4]], [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = sub <2 x i64> [[TMP7]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = shl <2 x i64> [[TMP5]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> poison, i64 [[D0]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP15]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> poison, i64 [[D1]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP17]], <2 x i64> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = or <2 x i64> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc <2 x i64> [[TMP19]] to <2 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: ; CHECK-NEXT: [[PHI0:%.*]] = phi i32 [ [[T1:%.*]], [[BB]] ], [ [[TT0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[T2:%.*]], [[BB]] ], [ [[TT1]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[T3:%.*]], [[BB]] ], [ [[TT2]], [[ENTRY]] ] -; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ [[T4:%.*]], [[BB]] ], [ [[TT3]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP22:%.*]], [[BB]] ], [ [[TMP20]], [[ENTRY]] ] ; CHECK-NEXT: [[T1]] = trunc i64 [[SHL1]] to i32 ; CHECK-NEXT: [[T2]] = trunc i64 [[SHL2]] to i32 -; CHECK-NEXT: [[T3]] = trunc i64 [[SHL3]] to i32 -; CHECK-NEXT: [[T4]] = trunc i64 [[SHL4]] to i32 +; CHECK-NEXT: [[TMP22]] = trunc <2 x i64> [[TMP14]] to <2 x i32> ; CHECK-NEXT: br label [[BB]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -23,40 +23,40 @@ define float @foo(ptr nocapture readonly %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP2]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], -; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 +; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP10:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], +; CHECK-NEXT: [[TMP10]] = fadd <2 x float> [[TMP4]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP12]], 9.000000e+00 ; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, ptr [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP14]], [[TMP15]] ; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] ; CHECK-NEXT: ret float [[ADD17]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -137,23 +137,23 @@ ; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float ; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float ; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 -; MAX256-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 -; MAX256-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] -; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] -; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 +; MAX256-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP1]], [[TMP3]] +; MAX256-NEXT: [[TMP5:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]] +; MAX256-NEXT: [[TMP6:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 +; MAX256-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[TMP7]], [[TMP3]] ; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX256-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] +; MAX256-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 +; MAX256-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP12:%.*]] = fmul <8 x float> [[TMP11]], [[TMP3]] +; MAX256-NEXT: [[TMP13:%.*]] = fadd <8 x float> zeroinitializer, [[TMP12]] +; MAX256-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX256-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP16:%.*]] = fmul <8 x float> [[TMP15]], [[TMP3]] +; MAX256-NEXT: [[TMP17:%.*]] = fadd <8 x float> zeroinitializer, [[TMP16]] ; MAX256-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX256-NEXT: i32 0, label [[BB2:%.*]] ; MAX256-NEXT: i32 1, label [[BB3:%.*]] @@ -166,40 +166,28 @@ ; MAX256: bb5: ; MAX256-NEXT: br label [[BB2]] ; MAX256: bb2: -; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] -; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] -; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX256-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 -; MAX256-NEXT: store float [[TMP17]], ptr undef, align 4 +; MAX256-NEXT: [[TMP18:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[TMP3]], [[BB1]] ] +; MAX256-NEXT: [[TMP19:%.*]] = phi <8 x float> [ [[TMP13]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP13]], [[BB5]] ], [ [[TMP13]], [[BB1]] ] +; MAX256-NEXT: [[TMP20:%.*]] = phi <8 x float> [ [[TMP17]], [[BB3]] ], [ [[TMP17]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[TMP17]], [[BB1]] ] +; MAX256-NEXT: [[TMP21:%.*]] = phi <8 x float> [ [[TMP5]], [[BB3]] ], [ [[TMP5]], [[BB4]] ], [ [[TMP5]], [[BB5]] ], [ [[TMP3]], [[BB1]] ] +; MAX256-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[TMP19]], i32 7 +; MAX256-NEXT: store float [[TMP22]], ptr undef, align 4 ; MAX256-NEXT: ret void ; ; MAX1024-LABEL: @phi_float32( ; MAX1024-NEXT: bb: ; MAX1024-NEXT: br label [[BB1:%.*]] ; MAX1024: bb1: -; MAX1024-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float -; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 -; MAX1024-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 -; MAX1024-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] -; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] -; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX1024-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] +; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[HVAL:%.*]], i32 0 +; MAX1024-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP2:%.*]] = fpext <4 x half> [[TMP1]] to <4 x float> +; MAX1024-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <32 x i32> +; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <32 x float> poison, float [[FVAL:%.*]], i32 0 +; MAX1024-NEXT: [[TMP5:%.*]] = shufflevector <32 x float> [[TMP4]], <32 x float> poison, <32 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP6:%.*]] = fmul <32 x float> [[TMP3]], [[TMP5]] +; MAX1024-NEXT: [[TMP7:%.*]] = fadd <32 x float> zeroinitializer, [[TMP6]] +; MAX1024-NEXT: [[TMP8:%.*]] = shufflevector <32 x float> [[TMP4]], <32 x float> [[TMP7]], <32 x i32> +; MAX1024-NEXT: [[TMP9:%.*]] = shufflevector <32 x float> [[TMP8]], <32 x float> poison, <32 x i32> ; MAX1024-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX1024-NEXT: i32 0, label [[BB2:%.*]] ; MAX1024-NEXT: i32 1, label [[BB3:%.*]] @@ -208,16 +196,15 @@ ; MAX1024: bb3: ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb4: +; MAX1024-NEXT: [[TMP10:%.*]] = shufflevector <32 x float> [[TMP8]], <32 x float> [[TMP7]], <32 x i32> ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb5: +; MAX1024-NEXT: [[TMP11:%.*]] = shufflevector <32 x float> [[TMP8]], <32 x float> [[TMP7]], <32 x i32> ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb2: -; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] -; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 -; MAX1024-NEXT: store float [[TMP17]], ptr undef, align 4 +; MAX1024-NEXT: [[TMP12:%.*]] = phi <32 x float> [ [[TMP7]], [[BB3]] ], [ [[TMP10]], [[BB4]] ], [ [[TMP11]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] +; MAX1024-NEXT: [[TMP13:%.*]] = extractelement <32 x float> [[TMP12]], i32 15 +; MAX1024-NEXT: store float [[TMP13]], ptr undef, align 4 ; MAX1024-NEXT: ret void ; bb: