diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -126,6 +126,10 @@
     cl::desc(
         "Attempt to vectorize horizontal reductions feeding into a store"));
 
+static cl::opt<bool> AllowSameScalarsReduction(
+    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
+    cl::desc("Allow horizontal reduction of same scalars."));
+
 static cl::opt<int>
 MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
     cl::desc("Attempt to vectorize for this register size in bits"));
@@ -12407,14 +12411,20 @@
     // If there are a sufficient number of reduction values, reduce
     // to a nearby power-of-2. We can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
-    size_t NumReducedVals =
+    unsigned NumReducedVals =
         std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
                         [](size_t Num, ArrayRef<Value *> Vals) {
                           if (!isGoodForReduction(Vals))
                             return Num;
                           return Num + Vals.size();
                         });
-    if (NumReducedVals < ReductionLimit) {
+    if (NumReducedVals < ReductionLimit &&
+        (!AllowSameScalarsReduction ||
+         all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
+           return RedV.size() < 2 ||
+                  !all_of(RedV, [&](Value *V) { return isConstant(V); }) ||
+                  !isSplat(RedV);
+         }))) {
       for (ReductionOpsType &RdxOps : ReductionOps)
         for (Value *RdxOp : RdxOps)
           V.analyzedReductionRoot(cast<Instruction>(RdxOp));
@@ -12519,10 +12529,78 @@
           }
         }
       }
+
+      // Emit code for constant values.
+      if (AllowSameScalarsReduction && Candidates.size() > 1 &&
+          all_of(Candidates, isConstant)) {
+        Value *Res = Candidates.front();
+        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
+        for (Value *V : ArrayRef(Candidates).drop_front()) {
+          Res = createOp(Builder, RdxKind, Res, V, "const.rdx", ReductionOps);
+          ++VectorizedVals.try_emplace(V, 0).first->getSecond();
+        }
+        if (!VectorizedTree) {
+          // Initialize the final value in the reduction.
+          VectorizedTree = Res;
+        } else {
+          // Update the final value in the reduction.
+          Builder.SetCurrentDebugLocation(
+              cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, Res,
+                                    "op.rdx", ReductionOps);
+        }
+        continue;
+      }
+
       unsigned NumReducedVals = Candidates.size();
-      if (NumReducedVals < ReductionLimit)
+      if (NumReducedVals < ReductionLimit &&
+          (NumReducedVals < 2 || !AllowSameScalarsReduction ||
+           !isSplat(Candidates)))
         continue;
 
+      // Gather same values.
+      MapVector<Value *, unsigned> SameValuesCounter;
+      for (Value *V : Candidates)
+        ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
+      // Check if we have repeated values.
+      bool IsSupportedReusedRdxOp =
+          AllowSameScalarsReduction && RdxKind != RecurKind::Mul &&
+          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
+      bool SameScaleFactor = false;
+      bool HasReusedScalars = SameValuesCounter.size() != Candidates.size();
+      if (IsSupportedReusedRdxOp && HasReusedScalars) {
+        SameScaleFactor =
+            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
+             RdxKind == RecurKind::Xor) &&
+            all_of(SameValuesCounter,
+                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
+                     return P.second == SameValuesCounter.front().second;
+                   });
+        Candidates.resize(SameValuesCounter.size());
+        transform(SameValuesCounter, Candidates.begin(),
+                  [](const auto &P) { return P.first; });
+        NumReducedVals = Candidates.size();
+        // Have a reduction of the same element.
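+        // E.g., an add reduction of N copies of a value V is emitted by
+        // emitReusedOps below as a single V * N instead of a vector
+        // reduction.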
+        if (NumReducedVals == 1) {
+          Value *RedVal =
+              emitReusedOps(Candidates.front(), Builder, std::nullopt,
+                            SameValuesCounter, TrackedToOrig);
+          if (!VectorizedTree) {
+            // Initialize the final value in the reduction.
+            VectorizedTree = RedVal;
+          } else {
+            // Update the final value in the reduction.
+            Builder.SetCurrentDebugLocation(
+                cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+            VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RedVal,
+                                      "op.rdx", ReductionOps);
+          }
+          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
+          VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]);
+          continue;
+        }
+      }
+
       unsigned MaxVecRegSize = V.getMaxVecRegSize();
       unsigned EltSize = V.getVectorElementSize(Candidates[0]);
       unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
@@ -12551,6 +12629,7 @@
         ReduxWidth /= 2;
         return IsAnyRedOpGathered;
       };
+      bool AnyVectorized = false;
       while (Pos < NumReducedVals - ReduxWidth + 1 &&
              ReduxWidth >= ReductionLimit) {
         // Dependency in tree of the reduction ops - drop this attempt, try
@@ -12603,15 +12682,19 @@
                      LocalExternallyUsedValues[TrackedVals[V]];
                    });
         }
-        // Number of uses of the candidates in the vector of values.
-        SmallDenseMap<Value *, unsigned> NumUses(Candidates.size());
-        for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
-          Value *V = Candidates[Cnt];
-          ++NumUses.try_emplace(V, 0).first->getSecond();
-        }
-        for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
-          Value *V = Candidates[Cnt];
-          ++NumUses.try_emplace(V, 0).first->getSecond();
+        if (!IsSupportedReusedRdxOp && HasReusedScalars) {
+          // Number of uses of the candidates in the vector of values.
+          SameValuesCounter.clear();
+          for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
+            Value *V = Candidates[Cnt];
+            Value *OrigV = TrackedToOrig.find(V)->second;
+            ++SameValuesCounter[OrigV];
+          }
+          for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
+            Value *V = Candidates[Cnt];
+            Value *OrigV = TrackedToOrig.find(V)->second;
+            ++SameValuesCounter[OrigV];
+          }
         }
         SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
         // Gather externally used values.
@@ -12626,7 +12709,9 @@
             LocalExternallyUsedValues[RdxVal];
             continue;
           }
-          unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
+          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+          unsigned NumOps =
+              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
           if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
             LocalExternallyUsedValues[RdxVal];
         }
@@ -12640,7 +12725,9 @@
             LocalExternallyUsedValues[RdxVal];
             continue;
           }
-          unsigned NumOps = VectorizedVals.lookup(RdxVal) + NumUses[RdxVal];
+          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+          unsigned NumOps =
+              VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
           if (NumOps != ReducedValsToOps.find(RdxVal)->second.size())
             LocalExternallyUsedValues[RdxVal];
         }
@@ -12727,9 +12814,28 @@
         if (isBoolLogicOp(RdxRootInst))
           VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
 
+        // Emit code to correctly handle reused reduced values, if required.
+        if (IsSupportedReusedRdxOp && HasReusedScalars && !SameScaleFactor) {
+          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, VL,
+                                         SameValuesCounter, TrackedToOrig);
+        }
+
         Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
 
+        // Improved analysis for add/fadd/xor reductions with same scale factor
+        // for all operands of reductions. We can emit scalar ops for them
+        // instead.
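+        // E.g., with a common repeat count of 2, an add reduction of
+        // <a, a, b, b, c, c> is emitted as the vector reduction of the unique
+        // values scaled once: 2 * reduce(a, b, c).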
+        if (IsSupportedReusedRdxOp && HasReusedScalars && SameScaleFactor) {
+          MapVector<Value *, unsigned> RedCounter;
+          RedCounter.insert(
+              std::make_pair(ReducedSubTree, SameValuesCounter.front().second));
+          DenseMap<Value *, Value *> RedToOrig(
+              {{ReducedSubTree, ReducedSubTree}});
+          ReducedSubTree = emitReusedOps(ReducedSubTree, Builder, std::nullopt,
+                                         RedCounter, RedToOrig);
+        }
+
         if (!VectorizedTree) {
           // Initialize the final value in the reduction.
           VectorizedTree = ReducedSubTree;
@@ -12742,14 +12848,38 @@
         }
         // Count vectorized reduced values to exclude them from final reduction.
         for (Value *RdxVal : VL) {
-          ++VectorizedVals.try_emplace(TrackedToOrig.find(RdxVal)->second, 0)
-                .first->getSecond();
+          Value *OrigV = TrackedToOrig.find(RdxVal)->second;
+          if (IsSupportedReusedRdxOp) {
+            VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]);
+            continue;
+          }
+          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
           if (!V.isVectorized(RdxVal))
             RequiredExtract.insert(RdxVal);
         }
         Pos += ReduxWidth;
         Start = Pos;
         ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
+        AnyVectorized = true;
       }
+      if (IsSupportedReusedRdxOp && HasReusedScalars && !AnyVectorized) {
+        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
+          Value *RedVal = emitReusedOps(P.first, Builder, std::nullopt,
+                                        SameValuesCounter, TrackedToOrig);
+          if (!VectorizedTree) {
+            // Initialize the final value in the reduction.
+            VectorizedTree = RedVal;
+          } else {
+            // Update the final value in the reduction.
+            Builder.SetCurrentDebugLocation(
+                cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
+            VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RedVal,
+                                      "op.rdx", ReductionOps);
+          }
+          Value *OrigV = TrackedToOrig.find(P.first)->second;
+          VectorizedVals.try_emplace(OrigV, P.second);
+        }
+        continue;
+      }
     }
     if (VectorizedTree) {
@@ -12977,6 +13107,112 @@
     ++NumVectorInstructions;
     return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
   }
+
+  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
+                       ArrayRef<Value *> VL,
+                       const MapVector<Value *, unsigned> &SameValuesCounter,
+                       const DenseMap<Value *, Value *> &TrackedToOrig) {
+    switch (RdxKind) {
+    case RecurKind::Add: {
+      // root = mul prev_root, <1, 1, n, 1>
+      if (VL.empty()) {
+        unsigned Cnt = SameValuesCounter.lookup(
+            TrackedToOrig.find(VectorizedValue)->second);
+        Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
+        LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
+                          << VectorizedValue << ". (HorRdx)\n");
+        return Builder.CreateMul(VectorizedValue, Scale);
+      }
+      SmallVector<Constant *> Vals;
+      for (Value *V : VL) {
+        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
+      }
+      auto *Scale = ConstantVector::get(Vals);
+      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
+                        << VectorizedValue << ". (HorRdx)\n");
+      return Builder.CreateMul(VectorizedValue, Scale);
+    }
+    case RecurKind::And:
+    case RecurKind::Or:
+      // No need for multiple or/and(s).
+      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
+                        << ". (HorRdx)\n");
+      return VectorizedValue;
+    case RecurKind::SMax:
+    case RecurKind::SMin:
+    case RecurKind::UMax:
+    case RecurKind::UMin:
+    case RecurKind::FMax:
+    case RecurKind::FMin:
+      // No need for multiple min/max(s) of the same value.
+      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
+                        << ". (HorRdx)\n");
+      return VectorizedValue;
+    case RecurKind::Xor: {
+      // Replace values with even number of repeats with 0, since
+      // x xor x = 0.
+      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
+      // 7>, if the 4th and 6th elements have an even number of repeats.
+      if (VL.empty()) {
+        unsigned Cnt = SameValuesCounter.lookup(
+            TrackedToOrig.find(VectorizedValue)->second);
+        LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
+                          << ". (HorRdx)\n");
+        if (Cnt % 2 == 0)
+          return Constant::getNullValue(VectorizedValue->getType());
+        return VectorizedValue;
+      }
+      SmallVector<int> Mask(
+          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
+          UndefMaskElem);
+      std::iota(Mask.begin(), Mask.end(), 0);
+      bool NeedShuffle = false;
+      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
+        Value *V = VL[I];
+        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+        if (Cnt % 2 == 0) {
+          Mask[I] = VF;
+          NeedShuffle = true;
+        }
+      }
+      LLVM_DEBUG(dbgs() << "SLP: Xor <";
+                 for (int I : Mask)
+                   dbgs() << I << " ";
+                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
+      if (NeedShuffle)
+        VectorizedValue = Builder.CreateShuffleVector(
+            VectorizedValue,
+            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
+      return VectorizedValue;
+    }
+    case RecurKind::FAdd: {
+      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
+      if (VL.empty()) {
+        unsigned Cnt = SameValuesCounter.lookup(
+            TrackedToOrig.find(VectorizedValue)->second);
+        Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
+        LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
+                          << VectorizedValue << ". (HorRdx)\n");
+        return Builder.CreateFMul(VectorizedValue, Scale);
+      }
+      SmallVector<Constant *> Vals;
+      for (Value *V : VL) {
+        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
+        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
+      }
+      auto *Scale = ConstantVector::get(Vals);
+      return Builder.CreateFMul(VectorizedValue, Scale);
+    }
+    case RecurKind::Mul:
+    case RecurKind::FMul:
+    case RecurKind::FMulAdd:
+    case RecurKind::SelectICmp:
+    case RecurKind::SelectFCmp:
+    case RecurKind::None:
+      llvm_unreachable("Unexpected reduction kind for reused scalars.");
+    }
+  }
 };
 
 } // end anonymous namespace
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization-inseltpoison.ll
@@ -18,16 +18,15 @@
; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53
; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820
; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2
-; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1
-; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]]
-; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]]
-; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]]
-; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]]
-; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]]
-; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]]
-; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]]
-; CHECK-NEXT: [[DOTSCALAR8:%.*]] = add i32 [[DOTSCALAR7]], 317425
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[DOTSCALAR8]], i64 0
+; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426
+; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]]
+; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]]
+; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]]
+; 
CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]] +; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[OP_RDX15]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]] ; CHECK-NEXT: ret <4 x i32> [[ADD29]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/scalarization.ll @@ -18,16 +18,15 @@ ; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[W:%.*]], 53 ; CHECK-NEXT: [[DIV17:%.*]] = sdiv i32 [[X:%.*]], 820 ; CHECK-NEXT: [[MUL21:%.*]] = shl nsw i32 [[U:%.*]], 2 -; CHECK-NEXT: [[DOTSCALAR:%.*]] = add i32 [[Y:%.*]], 1 -; CHECK-NEXT: [[DOTSCALAR1:%.*]] = add i32 [[DOTSCALAR]], [[DIV17]] -; CHECK-NEXT: [[DOTSCALAR2:%.*]] = add i32 [[DOTSCALAR1]], [[MUL5]] -; CHECK-NEXT: [[DOTSCALAR3:%.*]] = add i32 [[DOTSCALAR2]], [[DIV]] -; CHECK-NEXT: [[DOTSCALAR4:%.*]] = add i32 [[DOTSCALAR3]], [[MUL13]] -; CHECK-NEXT: [[DOTSCALAR5:%.*]] = add i32 [[DOTSCALAR4]], [[MUL]] -; CHECK-NEXT: [[DOTSCALAR6:%.*]] = add i32 [[DOTSCALAR5]], [[DIV9]] -; CHECK-NEXT: [[DOTSCALAR7:%.*]] = add i32 [[DOTSCALAR6]], [[MUL21]] -; CHECK-NEXT: [[DOTSCALAR8:%.*]] = add i32 [[DOTSCALAR7]], 317425 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[DOTSCALAR8]], i64 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[DIV17]], 317426 +; CHECK-NEXT: [[OP_RDX9:%.*]] = add nsw i32 [[DIV]], [[DIV9]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[MUL5]], [[MUL13]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[MUL]], [[MUL21]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX]], [[OP_RDX9]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[OP_RDX10]], [[OP_RDX11]] +; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX12]], [[OP_RDX13]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = add i32 [[OP_RDX14]], [[Y:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[OP_RDX15]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[ADD29:%.*]] = add <4 x i32> [[TMP2]], [[NUM:%.*]] ; CHECK-NEXT: ret <4 x i32> [[ADD29]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-reduce.ll @@ -1,18 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=arm64-apple-macosx | FileCheck %s +; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=arm64-apple-macosx -slp-optimize-identity-hor-reduction-ops=false | FileCheck %s --check-prefix=NO-IDENTITY define i8 @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[TMP0:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[CALL278:%.*]] = call i32 @fn(i32 [[SUM]]) -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 
[[CALL278]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[SHUFFLE]]) +; CHECK-NEXT: [[TMP0]] = mul i32 [[CALL278]], 8 ; CHECK-NEXT: br label [[FOR_BODY]] ; +; NO-IDENTITY-LABEL: @test( +; NO-IDENTITY-NEXT: entry: +; NO-IDENTITY-NEXT: br label [[FOR_BODY:%.*]] +; NO-IDENTITY: for.body: +; NO-IDENTITY-NEXT: [[SUM:%.*]] = phi i32 [ [[TMP2:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; NO-IDENTITY-NEXT: [[CALL278:%.*]] = call i32 @fn(i32 [[SUM]]) +; NO-IDENTITY-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[CALL278]], i32 0 +; NO-IDENTITY-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer +; NO-IDENTITY-NEXT: [[TMP2]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; NO-IDENTITY-NEXT: br label [[FOR_BODY]] +; entry: br label %for.body diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -5,56 +5,36 @@ define void @Test(i32) { ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE7]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE8]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX4]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP9]], [[SHUFFLE6]] -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP9]], [[SHUFFLE6]] -; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> 
[[TMP4]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; FORCE_REDUCTION-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; FORCE_REDUCTION-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] -; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX4]], [[TMP4]] -; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP4]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10]] = insertelement <2 x i32> [[TMP9]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP4]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP5]] +; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]] +; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP3]], 14910 +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[VAL_43]], i32 1 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector_splat_extractvalue.ll @@ -5,14 +5,8 @@ ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTOBIT1683:%.*]] = extractvalue { i64, i1 } zeroinitializer, 1 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i1> poison, i1 [[DOTOBIT1683]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> zeroinitializer) -; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = or i1 [[OP_RDX]], false -; CHECK-NEXT: [[OP_RDX2:%.*]] = or i1 [[OP_RDX1]], false -; CHECK-NEXT: br i1 [[OP_RDX2]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 false, [[DOTOBIT1683]] +; CHECK-NEXT: br i1 [[OP_RDX]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit2: ; CHECK-NEXT: ret float 0.000000e+00 ; CHECK: exit1: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -14,13 +14,11 @@ ; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] ; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP0]], undef -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD0]], [[ADD2]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[ADD4]], [[ADD9]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX2]] -; CHECK-NEXT: ret i32 [[OP_RDX4]] +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]] +; CHECK-NEXT: ret i32 [[OP_RDX3]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -6,16 +6,18 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 undef, i32 6 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> , [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = 
call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll b/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll @@ -14,12 +14,11 @@ ; CHECK-NEXT: [[UMIN33:%.*]] = select i1 [[L1]], ptr [[SCEVGEP31]], ptr [[P2]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[P3:%.*]], [[UMIN]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[C:%.*]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[C]] ; CHECK-NEXT: [[BOUND042:%.*]] = icmp ugt ptr [[P3]], [[UMIN33]] ; CHECK-NEXT: [[FOUND_CONFLICT44:%.*]] = and i1 [[BOUND042]], [[C]] -; CHECK-NEXT: [[CONFLICT_RDX45:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT44]] -; CHECK-NEXT: [[CONFLICT_RDX49:%.*]] = or i1 [[CONFLICT_RDX45]], [[C]] -; CHECK-NEXT: ret i1 [[CONFLICT_RDX49]] +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[C]], [[FOUND_CONFLICT]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = or i1 [[OP_RDX]], [[FOUND_CONFLICT44]] +; CHECK-NEXT: ret i1 [[OP_RDX1]] ; %l0 = icmp ult ptr %p2, %p1 %umin = select i1 %l0, ptr %p2, ptr %p1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -13,30 +13,25 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[SHUFFLE15:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE15]]) -; CHECK-NEXT: [[OP_RDX16:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[OP_RDX17:%.*]] = add i32 [[OP_RDX16]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX11]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; 
CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX17]], [[BB1]] ], [ 0, [[BB2:%.*]] ] +; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX12]], [[BB1]] ], [ 0, [[BB2:%.*]] ] ; CHECK-NEXT: ret i32 0 ; CHECK: bb4: -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE10]] -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 -; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX13]], [[TMP2]] -; CHECK-NEXT: ret i32 [[OP_RDX14]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE8]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX9:%.*]] = add i32 [[TMP11]], 0 +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[OP_RDX9]], [[TMP2]] +; CHECK-NEXT: ret i32 [[OP_RDX10]] ; CHECK: bb5: ; CHECK-NEXT: br label [[BB4:%.*]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -16,12 +16,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]] -; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4 -; CHECK-NEXT: ret float [[OP_RDX1]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; CHECK-NEXT: store float [[OP_RDX]], ptr @res, align 4 +; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @baz( ; THRESHOLD-NEXT: entry: @@ -31,12 +31,15 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast 
float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]] -; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr @res, align 4 -; THRESHOLD-NEXT: ret float [[OP_RDX1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; THRESHOLD-NEXT: store float [[OP_RDX]], ptr @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: %0 = load i32, ptr @n, align 4 @@ -315,22 +318,22 @@ define float @f(ptr nocapture readonly %x) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 -; THRESHOLD-NEXT: [[TMP3:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -484,18 +487,18 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float -; CHECK-NEXT: [[TMP1:%.*]] = 
load <32 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[CONV]] ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f1( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float -0.000000e+00, <32 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -603,43 +606,43 @@ ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) -; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call fast float 
@llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] ; CHECK-NEXT: ret float [[OP_RDX3]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 -; THRESHOLD-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 -; THRESHOLD-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]]) -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]] -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP2]]) +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] +; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP4]] ; THRESHOLD-NEXT: ret float [[OP_RDX3]] ; entry: @@ -740,27 +743,23 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], 3.000000e+00 -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]] 
-; CHECK-NEXT: ret float [[OP_RDX2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 +; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @extra_args( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> zeroinitializer -; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP3]], [[SHUFFLE]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]] -; THRESHOLD-NEXT: ret float [[OP_RDX2]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 +; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: %mul = mul nsw i32 %b, %a @@ -798,29 +797,28 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00 -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], [[CONV]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], 8.000000e+00 -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[OP_RDX1]] -; CHECK-NEXT: ret float [[OP_RDX3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]] +; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd 
fast float [[TMP2]], 5.000000e+00 -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> , float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], [[TMP5]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 +; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], +; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP3]], +; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> ; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 ; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; THRESHOLD-NEXT: ret float [[OP_RDX3]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: %mul = mul nsw i32 %b, %a @@ -861,31 +859,26 @@ ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], [[CONVC]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00 -; CHECK-NEXT: ret float [[OP_RDX3]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONVC]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00 +; CHECK-NEXT: ret float [[OP_RDX2]] ; ; THRESHOLD-LABEL: @extra_args_no_replace( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float ; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONVC]], i32 1 -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> zeroinitializer -; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], 
[[SHUFFLE]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00 -; THRESHOLD-NEXT: ret float [[OP_RDX3]] +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], [[TMP2]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONVC]] +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00 +; THRESHOLD-NEXT: ret float [[OP_RDX2]] ; entry: %mul = mul nsw i32 %b, %a @@ -976,32 +969,32 @@ ; CHECK-LABEL: @wobble( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[ARG]] -; CHECK-NEXT: ret i32 [[OP_RDX2]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]] +; CHECK-NEXT: ret i32 [[OP_RDX1]] ; ; THRESHOLD-LABEL: @wobble( ; THRESHOLD-NEXT: bb: ; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[ARG:%.*]], i32 0 -; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer -; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0 -; THRESHOLD-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; THRESHOLD-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -; THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer -; THRESHOLD-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> -; THRESHOLD-NEXT: [[TMP6:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP6]], [[TMP3]] -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[ARG]] -; THRESHOLD-NEXT: ret i32 [[OP_RDX2]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[BAR:%.*]], i32 0 +; THRESHOLD-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; THRESHOLD-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; THRESHOLD-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[TMP4]], zeroinitializer +; THRESHOLD-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> +; THRESHOLD-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP5]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[ARG]] +; THRESHOLD-NEXT: ret i32 [[OP_RDX1]] ; bb: %x1 = xor i32 %arg, %bar diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll @@ -62,13 +62,9 @@ ; CHECK: bb2: ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB:%.*]] ], [ undef, [[BB2]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 0, [[BB]] ], [ undef, [[BB2]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef -; CHECK-NEXT: call void @use(i32 [[OP_RDX1]]) +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[TMP]], 8 +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[TMP0]] +; CHECK-NEXT: call void @use(i32 [[OP_RDX]]) ; CHECK-NEXT: br label [[BB2]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll @@ -6,48 +6,21 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5 ; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6 -; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7 -; CHECK-NEXT: [[A3:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 8 ; CHECK-NEXT: br label [[WHILE:%.*]] ; CHECK: while: -; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX12:%.*]], [[WHILE]] ] +; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A2]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> 
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP8]], <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i64> [[TMP9]], <16 x i64> [[TMP10]], <16 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i64> [[TMP11]], <16 x i64> [[TMP8]], <16 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i64> [[TMP13]], i64 [[TMP0]], i32 9
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i64> [[TMP14]], i64 [[TMP0]], i32 10
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i64> [[TMP15]], i64 [[TMP0]], i32 11
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i64> [[TMP16]], <16 x i64> [[TMP17]], <16 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[A3]], align 16
-; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP18]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP22]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i64 [[TMP3]], [[TMP3]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i64 [[TMP3]], [[TMP23]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i64 [[TMP23]], [[TMP19]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = xor i64 [[TMP19]], [[TMP19]]
-; CHECK-NEXT:    [[OP_RDX5:%.*]] = xor i64 [[TMP20]], [[TMP20]]
-; CHECK-NEXT:    [[OP_RDX6:%.*]] = xor i64 [[TMP21]], [[TMP21]]
-; CHECK-NEXT:    [[OP_RDX7:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = xor i64 [[OP_RDX2]], [[OP_RDX3]]
-; CHECK-NEXT:    [[OP_RDX9:%.*]] = xor i64 [[OP_RDX4]], [[OP_RDX5]]
-; CHECK-NEXT:    [[OP_RDX10:%.*]] = xor i64 [[OP_RDX7]], [[OP_RDX8]]
-; CHECK-NEXT:    [[OP_RDX11:%.*]] = xor i64 [[OP_RDX9]], [[OP_RDX6]]
-; CHECK-NEXT:    [[OP_RDX12]] = xor i64 [[OP_RDX10]], [[OP_RDX11]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]])
+; CHECK-NEXT:    [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]]
+; CHECK-NEXT:    [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]]
+; CHECK-NEXT:    [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]]
 ; CHECK-NEXT:    br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
@@ -6,11 +6,7 @@
 ; CHECK-NEXT:    br i1 false, label [[PH:%.*]], label [[EXIT:%.*]]
 ; CHECK:       ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i8 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], [[TMP2]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i8 [[OP_RDX1]], 0
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i8 0, [[TMP0]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i8 [ [[OP_RDX2]], [[PH]] ], [ 0, [[BB:%.*]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -15,8 +15,8 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef
-; CHECK-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]],
+; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE2]],
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef
 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
@@ -24,13 +24,10 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP10]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = icmp slt i32 [[OP_RDX1]], undef
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef
-; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 undef, [[TMP9]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 undef, i32 [[TMP9]]
+; CHECK-NEXT:    [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX1]]
 ; CHECK-NEXT:    unreachable
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll
@@ -19,23 +19,19 @@
 ; CHECK:       for.cond.preheader:
 ; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 2
 ; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds [100 x i32], ptr undef, i64 0, i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[I]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX7:%.*]] = add i32 [[TMP2]], undef
-; CHECK-NEXT:    [[OP_RDX8:%.*]] = add i32 [[OP_RDX7]], undef
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[I1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    [[OP_RDX5:%.*]] = add i32 [[TMP5]], undef
-; CHECK-NEXT:    [[OP_RDX6:%.*]] = add i32 [[OP_RDX5]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP6]], undef
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX8]], [[OP_RDX8]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX6]], [[OP_RDX6]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[I]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[TMP1]], undef
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[I1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[TMP3]], undef
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i32 [[OP_RDX3]], 2
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 undef, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[OP_RDX2]], 2
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP5]]
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[OP_RDX4]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[R:%.*]] = phi i32 [ [[OP_RDX1]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll
@@ -6,24 +6,13 @@
 define i16 @D134605() {
 ; CHECK-LABEL: @D134605(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX81:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 3
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX81]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr poison, align 1
-; CHECK-NEXT:    [[ARRAYIDX101:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 1
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX101]], align 1
-; CHECK-NEXT:    [[ARRAYIDX107:%.*]] = getelementptr inbounds [32 x i16], ptr poison, i16 0, i16 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX107]], align 1
-; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i16 poison, [[TMP0]]
-; CHECK-NEXT:    [[ADD116:%.*]] = add i16 [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[ADD122:%.*]] = add i16 [[ADD116]], [[TMP2]]
-; CHECK-NEXT:    [[ADD124:%.*]] = add i16 [[ADD122]], [[TMP3]]
-; CHECK-NEXT:    [[ADD125:%.*]] = add i16 [[ADD124]], poison
-; CHECK-NEXT:    [[FACTOR2531:%.*]] = add i16 [[TMP3]], [[ADD125]]
-; CHECK-NEXT:    [[ADD14332:%.*]] = add i16 [[FACTOR2531]], [[TMP2]]
-; CHECK-NEXT:    [[ADD14933:%.*]] = add i16 [[ADD14332]], [[TMP1]]
-; CHECK-NEXT:    [[ADD15534:%.*]] = add i16 [[ADD14933]], [[TMP0]]
-; CHECK-NEXT:    [[ADD15935:%.*]] = add i16 [[ADD15534]], poison
-; CHECK-NEXT:    [[REASS_MUL24:%.*]] = shl i16 [[ADD15935]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr poison, align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
+; CHECK-NEXT:    [[REASS_ADD:%.*]] = add i16 poison, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i16 [[TMP2]], 2
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i16 [[TMP3]], poison
+; CHECK-NEXT:    [[REASS_MUL24:%.*]] = shl i16 [[OP_RDX]], 2
 ; CHECK-NEXT:    [[CALL:%.*]] = call i16 @check_i16(i16 noundef 1, i16 noundef [[REASS_MUL24]], i16 noundef 5120)
 ; CHECK-NEXT:    unreachable
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-schedule-use-order.ll
@@ -6,16 +6,14 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i1> [ [[TMP8:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i1> [ [[TMP6:%.*]], [[TMP1:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[TMP1]]
 ; CHECK:       1:
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i1> [[TMP0]] to <2 x i8>
-; CHECK-NEXT:    [[TMP3:%.*]] = or i8 0, 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i8> , i8 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i8> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i8> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <2 x i8> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP8]] = and <2 x i1> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i8> zeroinitializer, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6]] = and <2 x i1> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
@@ -11,9 +11,7 @@
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], undef
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 undef
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], undef
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef
-; CHECK-NEXT:    [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX3]]
+; CHECK-NEXT:    [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX1]]
 ; CHECK-NEXT:    [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef
 ; CHECK-NEXT:    ret void
 ;