diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -293,9 +293,6 @@ /// Return the fixed-order recurrences found in the loop. RecurrenceSet &getFixedOrderRecurrences() { return FixedOrderRecurrences; } - /// Return the set of instructions to sink to handle fixed-order recurrences. - MapVector &getSinkAfter() { return SinkAfter; } - /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -350,9 +350,9 @@ /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. - VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, - const MapVector &SinkAfter); + VPlanPtr + buildVPlanWithVPRecipes(VFRange &Range, + SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8677,34 +8677,10 @@ auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - MapVector &SinkAfter = Legal->getSinkAfter(); - // Dead instructions do not need sinking. Remove them from SinkAfter. - for (Instruction *I : DeadInstructions) - SinkAfter.erase(I); - - // Cannot sink instructions after dead instructions (there won't be any - // recipes for them). Instead, find the first non-dead previous instruction. - for (auto &P : Legal->getSinkAfter()) { - Instruction *SinkTarget = P.second; - Instruction *FirstInst = &*SinkTarget->getParent()->begin(); - (void)FirstInst; - while (DeadInstructions.contains(SinkTarget)) { - assert( - SinkTarget != FirstInst && - "Must find a live instruction (at least the one feeding the " - "fixed-order recurrence PHI) before reaching beginning of the block"); - SinkTarget = SinkTarget->getPrevNode(); - assert(SinkTarget != P.first && - "sink source equals target, no sinking required"); - } - P.second = SinkTarget; - } - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions)); VF = SubRange.End; } } @@ -8810,8 +8786,7 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, - const MapVector &SinkAfter) { + VFRange &Range, SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8822,12 +8797,6 @@ // process after constructing the initial VPlan. // --------------------------------------------------------------------------- - // Mark instructions we'll need to sink later and their targets as - // ingredients whose recipe we'll need to record. - for (const auto &Entry : SinkAfter) { - RecipeBuilder.recordRecipeOf(Entry.first); - RecipeBuilder.recordRecipeOf(Entry.second); - } for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; RecurKind Kind = @@ -8896,7 +8865,6 @@ DFS.perform(LI); VPBasicBlock *VPBB = HeaderVPBB; - SmallVector InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. @@ -8951,6 +8919,7 @@ Plan->addVPValue(UV, Def); } + RecipeBuilder.setRecipe(Instr, Recipe); if (isa(Recipe) && HeaderVPBB->getFirstNonPhi() != VPBB->end()) { // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section @@ -8959,11 +8928,9 @@ // block after applying SinkAfter, which relies on the original // position of the trunc. assert(isa(Instr)); - InductionsToMove.push_back( - cast(Recipe)); - } - RecipeBuilder.setRecipe(Instr, Recipe); - VPBB->appendRecipe(Recipe); + Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + } else + VPBB->appendRecipe(Recipe); continue; } @@ -8998,115 +8965,14 @@ // bring the VPlan to its final state. // --------------------------------------------------------------------------- - // Apply Sink-After legal constraints. - auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * { - auto *Region = dyn_cast_or_null(R->getParent()->getParent()); - if (Region && Region->isReplicator()) { - assert(Region->getNumSuccessors() == 1 && - Region->getNumPredecessors() == 1 && "Expected SESE region!"); - assert(R->getParent()->size() == 1 && - "A recipe in an original replicator region must be the only " - "recipe in its block"); - return Region; - } - return nullptr; - }; - for (const auto &Entry : SinkAfter) { - VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); - VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); - - auto *TargetRegion = GetReplicateRegion(Target); - auto *SinkRegion = GetReplicateRegion(Sink); - if (!SinkRegion) { - // If the sink source is not a replicate region, sink the recipe directly. - if (TargetRegion) { - // The target is in a replication region, make sure to move Sink to - // the block after it, not into the replication region itself. - VPBasicBlock *NextBlock = - cast(TargetRegion->getSuccessors().front()); - Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); - } else - Sink->moveAfter(Target); - continue; - } - - // The sink source is in a replicate region. Unhook the region from the CFG. - auto *SinkPred = SinkRegion->getSinglePredecessor(); - auto *SinkSucc = SinkRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); - VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); - VPBlockUtils::connectBlocks(SinkPred, SinkSucc); - - if (TargetRegion) { - // The target recipe is also in a replicate region, move the sink region - // after the target region. - auto *TargetSucc = TargetRegion->getSingleSuccessor(); - VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); - VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); - } else { - // The sink source is in a replicate region, we need to move the whole - // replicate region, which should only contain a single recipe in the - // main block. - auto *SplitBlock = - Target->getParent()->splitAt(std::next(Target->getIterator())); - - auto *SplitPred = SplitBlock->getSinglePredecessor(); - - VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); - VPBlockUtils::connectBlocks(SplitPred, SinkRegion); - VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - } - } - VPlanTransforms::removeRedundantCanonicalIVs(*Plan); VPlanTransforms::removeRedundantInductionCasts(*Plan); - // Now that sink-after is done, move induction recipes for optimized truncates - // to the phi section of the header block. - for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) - Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, RecipeBuilder, Range.Start); - // Introduce a recipe to combine the incoming and previous values of a - // fixed-order recurrence. - for (VPRecipeBase &R : - Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *RecurPhi = dyn_cast(&R); - if (!RecurPhi) - continue; - - VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe(); - // Fixed-order recurrences do not contain cycles, so this loop is guaranteed - // to terminate. - while (auto *PrevPhi = - dyn_cast(PrevRecipe)) - PrevRecipe = &PrevPhi->getBackedgeRecipe(); - VPBasicBlock *InsertBlock = PrevRecipe->getParent(); - auto *Region = GetReplicateRegion(PrevRecipe); - if (Region) - InsertBlock = dyn_cast(Region->getSingleSuccessor()); - if (!InsertBlock) { - InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); - VPBlockUtils::insertBlockAfter(InsertBlock, Region); - } - if (Region || PrevRecipe->isPhi()) - Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); - else - Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator())); - - auto *RecurSplice = cast( - Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, - {RecurPhi, RecurPhi->getBackedgeValue()})); - - RecurPhi->replaceAllUsesWith(RecurSplice); - // Set the first operand of RecurSplice to RecurPhi again, after replacing - // all users. - RecurSplice->setOperand(0, RecurPhi); - } + VPlanTransforms::fixFirstOrderRecurrences(*Plan, Builder); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1259,6 +1259,8 @@ void execute(VPTransformState &State) override; + VPValue *getPreviousValue() { return getOperand(1); } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -24,6 +24,7 @@ class ScalarEvolution; class Loop; class PredicatedScalarEvolution; +class VPBuilder; class TargetLibraryInfo; struct VPlanTransforms { @@ -76,6 +77,8 @@ static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE); + + static void fixFirstOrderRecurrences(VPlan &Plan, VPBuilder &Builder); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "VPlanTransforms.h" +#include "VPRecipeBuilder.h" #include "VPlanCFG.h" +#include "VPlanDominatorTree.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/IVDescriptors.h" @@ -533,3 +535,187 @@ // 1. Replace inductions with constants. // 2. Replace vector loop region with VPBasicBlock. } + +static VPRegionBlock *GetReplicateRegion(VPRecipeBase *R) { + auto *Region = dyn_cast_or_null(R->getParent()->getParent()); + if (Region && Region->isReplicator()) { + assert(Region->getNumSuccessors() == 1 && + Region->getNumPredecessors() == 1 && "Expected SESE region!"); + assert(R->getParent()->size() == 1 && + "A recipe in an original replicator region must be the only " + "recipe in its block"); + return Region; + } + return nullptr; +} + +static bool recipeComesBefore(const VPRecipeBase *A, const VPRecipeBase *B, + VPDominatorTree &VPDT) { + auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) { + for (auto &R : *A->getParent()) { + if (&R == A) + return true; + if (&R == B) + return false; + } + llvm_unreachable("recipe not found"); + }; + if (A->getParent() == B->getParent()) + return LocalComesBefore(A, B); + return VPDT.dominates(A->getParent(), B->getParent()); +} + +// Sink users of \p FOR after the recipe defining the previous value \p Previous +// of the recurrence. +static void +sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, + VPRecipeBase *Previous, + VPDominatorTree &VPDT) { + auto CompareByComesBefore = [&](const VPRecipeBase *A, + const VPRecipeBase *B) { + auto *RegionA = GetReplicateRegion(const_cast(A)); + auto *RegionB = GetReplicateRegion(const_cast(B)); + if (RegionA && RegionB) + return VPDT.dominates(RegionA, RegionB); + + return recipeComesBefore(A, B, VPDT); + }; + + // Collect recipes that need sinking. + SmallVector WorkList; + SmallPtrSet Seen; + Seen.insert(Previous); + // Keep recipes to sink ordered by dominance so earlier instructions are + // processed first. + std::set InstrsToSink( + CompareByComesBefore); + auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) { + if (isa(SinkCandidate) || + !Seen.insert(SinkCandidate).second || + recipeComesBefore(Previous, SinkCandidate, VPDT)) + return; + + WorkList.push_back(SinkCandidate); + + // VPPredInstPHIRecipes don't need sinking, because they will be sunk when + // sinking the containing replicate region. + if (!isa(SinkCandidate)) + InstrsToSink.insert(SinkCandidate); + }; + + // Recursively sink users of FOR after Previous. + WorkList.push_back(FOR); + for (unsigned I = 0; I != WorkList.size(); ++I) { + VPRecipeBase *Current = WorkList[I]; + for (VPUser *User : Current->getVPSingleValue()->users()) { + if (auto *R = dyn_cast(User)) + TryToPushSinkCandidate(R); + } + } + for (VPRecipeBase *SinkCandidate : InstrsToSink) { + VPRecipeBase *Target = Previous; + for (VPValue *Op : SinkCandidate->operands()) { + VPRecipeBase *Curr = Op->getDefiningRecipe(); + if (!Curr) + continue; + if (recipeComesBefore(Target, Curr, VPDT)) + Target = Curr; + } + + auto *TargetRegion = GetReplicateRegion(Target); + auto *SinkRegion = GetReplicateRegion(SinkCandidate); + if (!SinkRegion) { + // If the sink source is not a replicate region, sink the recipe + // directly. + if (TargetRegion) { + // The target is in a replication region, make sure to move Sink to + // the block after it, not into the replication region itself. + VPBasicBlock *NextBlock = + cast(TargetRegion->getSuccessors().front()); + SinkCandidate->moveBefore(*NextBlock, NextBlock->getFirstNonPhi()); + } else + SinkCandidate->moveAfter(Target); + continue; + } + // The sink source is in a replicate region. Unhook the region from the + // CFG. + auto *SinkPred = SinkRegion->getSinglePredecessor(); + auto *SinkSucc = SinkRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion); + VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc); + VPBlockUtils::connectBlocks(SinkPred, SinkSucc); + + if (TargetRegion) { + // The target recipe is also in a replicate region, move the sink + // region after the target region. + auto *TargetSucc = TargetRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc); + VPBlockUtils::connectBlocks(TargetRegion, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, TargetSucc); + } else { + // The sink source is in a replicate region, we need to move the whole + // replicate region, which should only contain a single recipe in the + // main block. + auto *SplitBlock = + Target->getParent()->splitAt(std::next(Target->getIterator())); + + auto *SplitPred = SplitBlock->getSinglePredecessor(); + + VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); + VPBlockUtils::connectBlocks(SplitPred, SinkRegion); + VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); + } + // We modified the CFG, update dominator tree. + VPDT.recalculate(*SinkRegion->getPlan()); + } +} + +void VPlanTransforms::fixFirstOrderRecurrences(VPlan &Plan, + VPBuilder &Builder) { + VPDominatorTree VPDT; + VPDT.recalculate(Plan); + + for (VPRecipeBase &R : + Plan.getVectorLoopRegion()->getEntry()->getEntryBasicBlock()->phis()) { + auto *FOR = dyn_cast(&R); + if (!FOR) + continue; + + SmallPtrSet SeenPhis; + VPRecipeBase *Previous = FOR->getPreviousValue()->getDefiningRecipe(); + // Fixed-order recurrences do not contain cycles, so this loop is guaranteed + // to terminate. + while (auto *PrevPhi = + dyn_cast_or_null(Previous)) { + assert(PrevPhi->getParent() == FOR->getParent()); + assert(SeenPhis.insert(PrevPhi).second); + Previous = PrevPhi->getPreviousValue()->getDefiningRecipe(); + } + + sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT); + + // Introduce a recipe to combine the incoming and previous values of a + // fixed-order recurrence. + VPBasicBlock *InsertBlock = Previous->getParent(); + auto *Region = GetReplicateRegion(Previous); + if (Region) + InsertBlock = dyn_cast(Region->getSingleSuccessor()); + if (!InsertBlock) { + InsertBlock = new VPBasicBlock(Region->getName() + ".succ"); + VPBlockUtils::insertBlockAfter(InsertBlock, Region); + } + if (Region || Previous->isPhi()) + Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi()); + else + Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator())); + + auto *RecurSplice = cast( + Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice, + {FOR, FOR->getBackedgeValue()})); + + FOR->replaceAllUsesWith(RecurSplice); + // Set the first operand of RecurSplice to FOR again, after replacing + // all users. + RecurSplice->setOperand(0, FOR); + } +} diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -57,7 +57,7 @@ ; CHECK-NEXT: store i32 [[ADD_2]], ptr [[IDX_2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -460,18 +460,18 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP4]], ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[DST_1:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[DST_2:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[DST_3:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -555,19 +555,19 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[VECTOR_RECUR]], <4 x float> [[BROADCAST_SPLAT3]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[TMP4]], +; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[DST_1:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP7]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[DST_2:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[DST_3:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -950,10 +950,10 @@ ; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP5]], <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> [[TMP10]] ; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -1074,9 +1074,9 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[WIDE_LOAD]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x double> , [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> , [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP4]], [[TMP5]] ; CHECK-NEXT: store <4 x double> [[TMP6]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996