diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11192,7 +11192,12 @@ unsigned NumReducedVals = std::accumulate( ReducedVals.begin(), ReducedVals.end(), 0, [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); }); - if (NumReducedVals < ReductionLimit) + if (NumReducedVals < ReductionLimit && + all_of(ReducedVals, [](ArrayRef<Value *> RedV) { + return RedV.size() < 2 || + !all_of(RedV, [&](Value *V) { return isConstant(V); }) || + !isSplat(RedV); + })) return nullptr; IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); @@ -11286,10 +11291,75 @@ } } } + + // Emit code for constant values. + if (Candidates.size() > 1 && all_of(Candidates, isConstant)) { + Value *Res = Candidates.front(); + ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond(); + for (Value *V : makeArrayRef(Candidates).drop_front()) { + Res = createOp(Builder, RdxKind, Res, V, "const.rdx", ReductionOps); + ++VectorizedVals.try_emplace(V, 0).first->getSecond(); + } + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = Res; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, Res, + "op.rdx", ReductionOps); + } + continue; + } + unsigned NumReducedVals = Candidates.size(); - if (NumReducedVals < ReductionLimit) + if (NumReducedVals < ReductionLimit && + (NumReducedVals < 2 || !isSplat(Candidates))) continue; + // Gather same values. + MapVector<Value *, unsigned> SameValuesCounter; + for (Value *V : Candidates) + ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second; + // Check if we have repeated values. + bool IsSupportedReusedRdxOp = RdxKind != RecurKind::Mul && + RdxKind != RecurKind::FMul && + RdxKind != RecurKind::FMulAdd; + bool SameScaleFactor = false; + bool HasReusedScalars = SameValuesCounter.size() != Candidates.size(); + if (IsSupportedReusedRdxOp && HasReusedScalars) { + SameScaleFactor = + (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || + RdxKind == RecurKind::Xor) && + all_of(SameValuesCounter, + [&SameValuesCounter](const std::pair<Value *, unsigned> &P) { + return P.second == SameValuesCounter.front().second; + }); + Candidates.resize(SameValuesCounter.size()); + transform(SameValuesCounter, Candidates.begin(), + [](const auto &P) { return P.first; }); + NumReducedVals = Candidates.size(); + // Have a reduction of the same element. + if (NumReducedVals == 1) { + Value *RedVal = emitReusedOps(Candidates.front(), Builder, None, + SameValuesCounter, TrackedToOrig); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = RedVal; + } else { + // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RedVal, + "op.rdx", ReductionOps); + } + Value *OrigV = TrackedToOrig.find(Candidates.front())->second; + VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]); + continue; + } + } + unsigned MaxVecRegSize = V.getMaxVecRegSize(); unsigned EltSize = V.getVectorElementSize(Candidates[0]); unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize); @@ -11318,6 +11388,7 @@ ReduxWidth /= 2; return IsAnyRedOpGathered; }; + bool AnyVectorized = false; while (Pos < NumReducedVals - ReduxWidth + 1 && ReduxWidth >= ReductionLimit) { // Dependency in tree of the reduction ops - drop this attempt, try @@ -11370,19 +11441,23 @@ LocalExternallyUsedValues[TrackedVals[V]]; }); } - // Number of uses of the candidates in the vector of values. - SmallDenseMap<Value *, unsigned> NumUses; - for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { - Value *V = Candidates[Cnt]; - if (NumUses.count(V) > 0) - continue; - NumUses[V] = std::count(VL.begin(), VL.end(), V); - } - for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { - Value *V = Candidates[Cnt]; - if (NumUses.count(V) > 0) - continue; - NumUses[V] = std::count(VL.begin(), VL.end(), V); + if (!IsSupportedReusedRdxOp && HasReusedScalars) { + // Number of uses of the candidates in the vector of values. + SameValuesCounter.clear(); + for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) { + Value *V = Candidates[Cnt]; + if (SameValuesCounter.count(V) > 0) + continue; + Value *OrigV = TrackedToOrig.find(V)->second; + SameValuesCounter[OrigV] = std::count(VL.begin(), VL.end(), V); + } + for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) { + Value *V = Candidates[Cnt]; + if (SameValuesCounter.count(V) > 0) + continue; + Value *OrigV = TrackedToOrig.find(V)->second; + SameValuesCounter[OrigV] = std::count(VL.begin(), VL.end(), V); + } } // Gather externally used values. SmallPtrSet<Value *, 4> Visited; @@ -11390,7 +11465,8 @@ Value *V = Candidates[Cnt]; if (!Visited.insert(V).second) continue; - unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + Value *OrigV = TrackedToOrig.find(V)->second; + unsigned NumOps = VectorizedVals.lookup(V) + SameValuesCounter[OrigV]; if (NumOps != ReducedValsToOps.find(V)->second.size()) LocalExternallyUsedValues[V]; } @@ -11398,7 +11474,8 @@ Value *V = Candidates[Cnt]; if (!Visited.insert(V).second) continue; - unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V]; + Value *OrigV = TrackedToOrig.find(V)->second; + unsigned NumOps = VectorizedVals.lookup(V) + SameValuesCounter[OrigV]; if (NumOps != ReducedValsToOps.find(V)->second.size()) LocalExternallyUsedValues[V]; } @@ -11466,9 +11543,28 @@ if (isBoolLogicOp(RdxRootInst)) VectorizedRoot = Builder.CreateFreeze(VectorizedRoot); + // Emit code to correctly handle reused reduced values, if required. + if (IsSupportedReusedRdxOp && HasReusedScalars && !SameScaleFactor) { + VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, VL, + SameValuesCounter, TrackedToOrig); + } + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); + // Improved analysis for add/fadd/xor reductions with the same scale + // factor for all reduction operands. We can emit scalar ops for them + // instead.
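+ // E.g., (x + x + y + y) reduces to 2 * (x + y): reduce the deduplicated + // operands once, then scale the scalar result by the common repeat count + // (for xor, an even common count instead zeroes the result).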
+ if (IsSupportedReusedRdxOp && HasReusedScalars && SameScaleFactor) { + MapVector<Value *, unsigned> RedCounter; + RedCounter.insert( + std::make_pair(ReducedSubTree, SameValuesCounter.front().second)); + DenseMap<Value *, Value *> RedToOrig( + {{ReducedSubTree, ReducedSubTree}}); + ReducedSubTree = emitReusedOps(ReducedSubTree, Builder, None, + RedCounter, RedToOrig); + } + if (!VectorizedTree) { // Initialize the final value in the reduction. VectorizedTree = ReducedSubTree; @@ -11480,12 +11576,37 @@ ReducedSubTree, "op.rdx", ReductionOps); } // Count vectorized reduced values to exclude them from final reduction. - for (Value *V : VL) - ++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0) - .first->getSecond(); + for (Value *V : VL) { + Value *OrigV = TrackedToOrig.find(V)->second; + if (IsSupportedReusedRdxOp) { + VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]); + continue; + } + ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond(); + } Pos += ReduxWidth; Start = Pos; ReduxWidth = PowerOf2Floor(NumReducedVals - Pos); + AnyVectorized = true; + } + if (IsSupportedReusedRdxOp && HasReusedScalars && !AnyVectorized) { + for (const std::pair<Value *, unsigned> &P : SameValuesCounter) { + Value *RedVal = emitReusedOps(P.first, Builder, None, + SameValuesCounter, TrackedToOrig); + if (!VectorizedTree) { + // Initialize the final value in the reduction. + VectorizedTree = RedVal; + } else { + // Update the final value in the reduction. + Builder.SetCurrentDebugLocation( + cast<Instruction>(ReductionOps.front().front())->getDebugLoc()); + VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RedVal, + "op.rdx", ReductionOps); + } + Value *OrigV = TrackedToOrig.find(P.first)->second; + VectorizedVals.try_emplace(OrigV, P.second); + } + continue; } } if (VectorizedTree) { @@ -11713,6 +11834,96 @@ ++NumVectorInstructions; return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } + + Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, + ArrayRef<Value *> VL, + const MapVector<Value *, unsigned> &SameValuesCounter, + const DenseMap<Value *, Value *> &TrackedToOrig) { + switch (RdxKind) { + case RecurKind::Add: { + // root = mul prev_root, <1, 1, n, 1> + if (VL.empty()) { + unsigned Cnt = SameValuesCounter.lookup( + TrackedToOrig.find(VectorizedValue)->second); + Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt); + return Builder.CreateMul(VectorizedValue, Scale); + } + SmallVector<Constant *> Vals; + for (Value *V : VL) { + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false)); + } + auto *Scale = ConstantVector::get(Vals); + return Builder.CreateMul(VectorizedValue, Scale); + } + case RecurKind::And: + case RecurKind::Or: + // No need for multiple or/and(s). + return VectorizedValue; + case RecurKind::SMax: + case RecurKind::SMin: + case RecurKind::UMax: + case RecurKind::UMin: + case RecurKind::FMax: + case RecurKind::FMin: + // No need for multiple min/max(s) of the same value. + return VectorizedValue; + case RecurKind::Xor: { + // Replace values with even number of repeats with 0, since + // x xor x = 0. + // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6, + // 7>, if the 4th and 6th elements have an even number of repeats.
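+ // The VL.empty() path below scales a single deduplicated scalar; the + // vector path builds a mask that redirects every even-repeat lane to + // the zero vector.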
+ if (VL.empty()) { + unsigned Cnt = SameValuesCounter.lookup( + TrackedToOrig.find(VectorizedValue)->second); + if (Cnt % 2 == 0) + return Constant::getNullValue(VectorizedValue->getType()); + return VectorizedValue; + } + SmallVector<int> Mask( + cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(), + UndefMaskElem); + std::iota(Mask.begin(), Mask.end(), 0); + bool NeedShuffle = false; + for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { + Value *V = VL[I]; + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + if (Cnt % 2 == 0) { + Mask[I] = VF; + NeedShuffle = true; + } + } + if (NeedShuffle) + VectorizedValue = Builder.CreateShuffleVector( + VectorizedValue, + ConstantVector::getNullValue(VectorizedValue->getType()), Mask); + return VectorizedValue; + } + case RecurKind::FAdd: { + // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> + if (VL.empty()) { + unsigned Cnt = SameValuesCounter.lookup( + TrackedToOrig.find(VectorizedValue)->second); + Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt); + return Builder.CreateFMul(VectorizedValue, Scale); + } + SmallVector<Constant *> Vals; + for (Value *V : VL) { + unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second); + Vals.push_back(ConstantFP::get(V->getType(), Cnt)); + } + auto *Scale = ConstantVector::get(Vals); + return Builder.CreateFMul(VectorizedValue, Scale); + } + case RecurKind::Mul: + case RecurKind::FMul: + case RecurKind::FMulAdd: + case RecurKind::SelectICmp: + case RecurKind::SelectFCmp: + case RecurKind::None: + llvm_unreachable("Unexpected reduction kind for reused scalars."); + } + } }; } // end anonymous namespace diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -5,56 +5,36 @@ define void @Test(i32) { ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP14:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX4]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32
[[TMP4]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP14]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = and <2 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; FORCE_REDUCTION-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; FORCE_REDUCTION-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] -; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX4]], [[TMP4]] -; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP4]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10]] = insertelement <2 x i32> [[TMP9]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], +; 
FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] +; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP2]] +; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP2]], 14910 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -14,13 +14,11 @@ ; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] ; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP0]], undef -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD0]], [[ADD2]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[ADD4]], [[ADD9]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX2]] -; CHECK-NEXT: ret i32 [[OP_RDX4]] +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]] +; CHECK-NEXT: ret i32 [[OP_RDX3]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -6,17 +6,19 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 undef, i32 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <8 x i32> , [[SHUFFLE]] -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> , [[SHUFFLE]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i32> zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector 
<8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP3]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP3]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll b/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/float-min-max.ll @@ -17,12 +17,11 @@ ; CHECK-NEXT: [[UMIN3334:%.*]] = bitcast float* [[UMIN33]] to i8* ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[P3:%.*]], [[UMIN22]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[C:%.*]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[C]] ; CHECK-NEXT: [[BOUND042:%.*]] = icmp ugt i8* [[P3]], [[UMIN3334]] ; CHECK-NEXT: [[FOUND_CONFLICT44:%.*]] = and i1 [[BOUND042]], [[C]] -; CHECK-NEXT: [[CONFLICT_RDX45:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT44]] -; CHECK-NEXT: [[CONFLICT_RDX49:%.*]] = or i1 [[CONFLICT_RDX45]], [[C]] -; CHECK-NEXT: ret i1 [[CONFLICT_RDX49]] +; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[C]], [[FOUND_CONFLICT]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = or i1 [[OP_RDX]], [[FOUND_CONFLICT44]] +; CHECK-NEXT: ret i1 [[OP_RDX1]] ; %scevgep21 = getelementptr float, float* %p1, i32 0 %l0 = icmp ult float* %p2, %scevgep21 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-extractelements-different-bbs.ll @@ -13,30 +13,25 @@ ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[SHUFFLE15:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUFFLE15]]) -; CHECK-NEXT: [[OP_RDX16:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[OP_RDX17:%.*]] = add i32 [[OP_RDX16]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1 +; CHECK-NEXT: [[OP_RDX11:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = add i32 [[OP_RDX11]], 0 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX17]], [[BB1]] ], [ 0, [[BB2:%.*]] ] +; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OP_RDX12]], [[BB1]] ], [ 0, [[BB2:%.*]] ] ; CHECK-NEXT: ret i32 0 ; CHECK: bb4: -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> 
[[SHUFFLE]], [[SHUFFLE10]] -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 -; CHECK-NEXT: [[OP_RDX13:%.*]] = add i32 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[OP_RDX14:%.*]] = add i32 [[OP_RDX13]], [[TMP2]] -; CHECK-NEXT: ret i32 [[OP_RDX14]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE8]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX9:%.*]] = add i32 [[TMP11]], 0 +; CHECK-NEXT: [[OP_RDX10:%.*]] = add i32 [[OP_RDX9]], [[TMP2]] +; CHECK-NEXT: ret i32 [[OP_RDX10]] ; CHECK: bb5: ; CHECK-NEXT: br label [[BB4:%.*]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -16,12 +16,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]] -; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4 -; CHECK-NEXT: ret float [[OP_RDX1]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 2.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] +; CHECK-NEXT: store float [[OP_RDX]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @baz( ; THRESHOLD-NEXT: entry: @@ -31,12 +31,15 @@ ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> -; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]] -; 
THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[OP_RDX1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; THRESHOLD-NEXT: store float [[OP_RDX]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: %0 = load i32, i32* @n, align 4 @@ -755,10 +758,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], 3.000000e+00 -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: ret float [[OP_RDX2]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 +; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @extra_args( ; THRESHOLD-NEXT: entry: @@ -767,14 +770,10 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP3]], [[TMP5]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; THRESHOLD-NEXT: ret float [[OP_RDX2]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00 +; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: %mul = mul nsw i32 %b, %a @@ -815,11 +814,10 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00 -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], [[CONV]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], 8.000000e+00 -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[OP_RDX1]] -; CHECK-NEXT: ret float [[OP_RDX3]] +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast 
float [[TMP2]], 1.300000e+01 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP3]] +; CHECK-NEXT: ret float [[OP_RDX1]] ; ; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-NEXT: entry: @@ -828,15 +826,15 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00 -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0 +; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 ; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> , float [[CONV]], i32 1 -; THRESHOLD-NEXT: [[TMP6:%.*]] = fadd fast <2 x float> [[TMP4]], [[TMP5]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; THRESHOLD-NEXT: ret float [[OP_RDX3]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], +; THRESHOLD-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP4]], +; THRESHOLD-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP6]], <2 x i32> +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP8]], [[TMP9]] +; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; entry: %mul = mul nsw i32 %b, %a @@ -880,11 +878,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONV]], [[CONVC]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00 -; CHECK-NEXT: ret float [[OP_RDX3]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONVC]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00 +; CHECK-NEXT: ret float [[OP_RDX2]] ; ; THRESHOLD-LABEL: @extra_args_no_replace( ; THRESHOLD-NEXT: entry: @@ -894,16 +892,11 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]]) -; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[CONVC]], i32 1 -; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[CONV]], i32 0 -; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[CONV]], 
i32 1 -; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[TMP4]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 -; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00 -; THRESHOLD-NEXT: ret float [[OP_RDX3]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast float [[CONV]], 2.000000e+00 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONVC]] +; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00 +; THRESHOLD-NEXT: ret float [[OP_RDX2]] ; entry: %mul = mul nsw i32 %b, %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll @@ -51,13 +51,9 @@ ; CHECK: bb2: ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB:%.*]] ], [ undef, [[BB2]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 0, [[BB]] ], [ undef, [[BB2]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[SHUFFLE]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef -; CHECK-NEXT: call void @use(i32 [[OP_RDX1]]) +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[TMP]], 8 +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[TMP0]] +; CHECK-NEXT: call void @use(i32 [[OP_RDX]]) ; CHECK-NEXT: br label [[BB2]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -15,8 +15,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef -; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE2]], ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 @@ -24,13 +24,10 @@ ; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP6]], undef ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP9]], i32 [[TMP10]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = 
icmp slt i32 [[OP_RDX1]], undef -; CHECK-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef -; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX3]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 undef, [[TMP9]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 undef, i32 [[TMP9]] +; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_RDX1]] ; CHECK-NEXT: unreachable ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorized_rdx_crash.ll @@ -22,22 +22,18 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX7:%.*]] = add i32 [[TMP2]], undef -; CHECK-NEXT: [[OP_RDX8:%.*]] = add i32 [[OP_RDX7]], undef +; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[TMP2]], undef ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_RDX5:%.*]] = add i32 [[TMP5]], undef -; CHECK-NEXT: [[OP_RDX6:%.*]] = add i32 [[OP_RDX5]], undef -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP6]], undef -; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX8]], [[OP_RDX8]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX6]], [[OP_RDX6]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], [[OP_RDX2]] +; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[TMP5]], undef +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[OP_RDX3]], 2 +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[OP_RDX2]], 2 +; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP7]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[OP_RDX4]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[OP_RDX1]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll @@ -12,9 +12,7 @@ ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], undef ; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 undef -; CHECK-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], undef -; CHECK-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 undef -; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX3]] +; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX1]] ; CHECK-NEXT: [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef ; CHECK-NEXT: ret void ;
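For reference, a minimal standalone sketch (not part of the patch; the helper names are made up) of the scalar identities that emitReusedOps exploits. The patch emits the same arithmetic as IR: mul/fmul for add/fadd, lane zeroing for xor, and a no-op for the idempotent kinds (and/or/min/max):

#include <cassert>
#include <cstdint>

// add/fadd: a value repeated Cnt times contributes Cnt * V to the sum.
uint32_t reducedAdd(uint32_t V, unsigned Cnt) { return V * Cnt; }

// xor: pairs cancel (x ^ x == 0), so only an odd repeat count contributes.
uint32_t reducedXor(uint32_t V, unsigned Cnt) { return Cnt % 2 ? V : 0; }

// and/or/min/max are idempotent: repeats change nothing.
uint32_t reducedAnd(uint32_t V, unsigned /*Cnt*/) { return V; }

int main() {
  assert(reducedAdd(7, 3) == 7u + 7u + 7u);        // 3 * 7
  assert(reducedXor(7, 3) == (7u ^ 7u ^ 7u));      // odd count: 7
  assert(reducedXor(7, 2) == (7u ^ 7u));           // even count: 0
  assert(reducedAnd(7, 4) == (7u & 7u & 7u & 7u)); // idempotent: 7
  return 0;
}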