Index: llvm/include/llvm/Analysis/IVDescriptors.h
===================================================================
--- llvm/include/llvm/Analysis/IVDescriptors.h
+++ llvm/include/llvm/Analysis/IVDescriptors.h
@@ -220,6 +220,11 @@
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() { return IsSigned; }
 
+  /// Attempts to find a chain of operations from Phi to LoopExitInstr that can
+  /// be treated as a set of reduction instructions for in-loop reductions.
+  SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
+                                                    Loop *L) const;
+
 private:
   // The starting value of the recurrence.
   // It does not have to be zero!
Index: llvm/lib/Analysis/IVDescriptors.cpp
===================================================================
--- llvm/lib/Analysis/IVDescriptors.cpp
+++ llvm/lib/Analysis/IVDescriptors.cpp
@@ -796,6 +796,58 @@
   }
 }
 
+static bool findPathToPhi(Instruction *Cur,
+                          SmallVectorImpl<Instruction *> &ReductionOperations,
+                          unsigned Opcode, PHINode *Phi, Loop *L) {
+  assert(Cur->getOpcode() == Opcode);
+  assert(Cur->getNumOperands() == 2);
+
+  if (Cur->getOperand(0) == Phi || Cur->getOperand(1) == Phi) {
+    ReductionOperations.push_back(Cur);
+    return true;
+  }
+
+  if (auto *LHS = dyn_cast<Instruction>(Cur->getOperand(0))) {
+    if (LHS->getOpcode() == Opcode && L->contains(LHS->getParent()) &&
+        LHS->hasOneUse() &&
+        findPathToPhi(LHS, ReductionOperations, Opcode, Phi, L)) {
+      ReductionOperations.push_back(Cur);
+      return true;
+    }
+  }
+  if (auto *RHS = dyn_cast<Instruction>(Cur->getOperand(1))) {
+    if (RHS->getOpcode() == Opcode && L->contains(RHS->getParent()) &&
+        RHS->hasOneUse() &&
+        findPathToPhi(RHS, ReductionOperations, Opcode, Phi, L)) {
+      ReductionOperations.push_back(Cur);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+SmallVector<Instruction *, 4>
+RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
+  SmallVector<Instruction *, 4> ReductionOperations;
+  unsigned RedOp = getRecurrenceBinOp(Kind);
+
+  // DFS up through the chain of operations in the loop, starting from
+  // LoopExitInstr and searching for Phi.
+
+  // Note that we check that the opcode of the operand is correct for each
+  // item in the chain, including the first. This can come up from sub, which
+  // would otherwise be treated as an add reduction.
+  // FIXME: We also do not attempt to look through Shuffles/Phis yet, which
+  // might be part of the reduction chain.
+  // FIXME: Doesn't work for min/max yet, which need to look for the
+  // appropriate cmp/select pair.
+  if (LoopExitInstr->getOpcode() == RedOp &&
+      findPathToPhi(LoopExitInstr, ReductionOperations, RedOp, Phi, L))
+    return ReductionOperations;
+  return {};
+}
+
 InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
                                          const SCEV *Step, BinaryOperator *BOp,
                                          SmallVectorImpl<Instruction *> *Casts)
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -265,6 +265,11 @@
     cl::desc("The maximum interleave count to use when interleaving a scalar "
              "reduction in a nested loop."));
 
+static cl::opt<bool>
+    ForceInloopReductions("force-inloop-reductions", cl::init(false),
+                          cl::Hidden,
+                          cl::desc("Force in-loop vector reductions."));
+
 cl::opt<bool> EnableVPlanNativePath(
     "enable-vplan-native-path", cl::init(false), cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -1031,6 +1036,10 @@
   /// Collect values we want to ignore in the cost model.
   void collectValuesToIgnore();
 
+  /// Split reductions into those that happen in the loop, and those that
+  /// happen outside.
+  void categorizeReductions();
+
   /// \returns The smallest bitwidth each instruction can be represented with.
   /// The vector equivalents of these instructions should be truncated to this
   /// type.
@@ -1298,6 +1307,19 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if the Phi is part of an inloop reduction.
+  bool useInloopReductions(PHINode *Phi) const {
+    return InloopReductions.count(Phi);
+  }
+
+  /// Returns true if there are any inloop reductions.
+  bool hasInloopReductions() const { return InloopReductions.size() != 0; }
+
+  /// Returns true if there are any outside-loop reductions.
+  bool hasOuterloopReductions() const {
+    return InloopReductions.size() < Legal->getReductionVars().size();
+  }
+
   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
   /// with factor VF. Return the cost of the instruction, including
   /// scalarization overhead if it's needed.
@@ -1425,6 +1447,9 @@
   /// scalarized.
   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
 
+  /// PHINodes of the reductions that should be expanded in-loop.
+  SmallPtrSet<PHINode *, 4> InloopReductions;
+
   /// Returns the expected difference in cost from scalarizing the expression
   /// feeding a predicated instruction \p PredInst. The instructions to
   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -3723,6 +3748,7 @@
   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
       RdxDesc.getMinMaxRecurrenceKind();
   setDebugLocFromInst(Builder, ReductionStartValue);
+  bool UseInloopReductions = Cost->useInloopReductions(Phi);
 
   // We need to generate a reduction vector from the incoming scalar.
   // To do so, we need to generate the 'identity' vector and override
@@ -3740,7 +3766,7 @@
   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
       RK == RecurrenceDescriptor::RK_FloatMinMax) {
     // MinMax reductions have the start value as their identity.
-    if (VF == 1) {
+    if (VF == 1 || UseInloopReductions) {
       VectorStart = Identity = ReductionStartValue;
     } else {
       VectorStart = Identity =
@@ -3750,7 +3776,7 @@
     // Handle other reduction kinds:
     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
         RK, VecTy->getScalarType());
-    if (VF == 1) {
+    if (VF == 1 || UseInloopReductions) {
       Identity = Iden;
       // This vector is the Identity vector where the first element is the
       // incoming scalar reduction.
@@ -3817,7 +3843,8 @@
   // If the vector reduction can be performed in a smaller type, we truncate
   // then extend the loop exit value to enable InstCombine to evaluate the
   // entire expression in the smaller type.
-  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+  if ((VF > 1 && !UseInloopReductions) &&
+      Phi->getType() != RdxDesc.getRecurrenceType()) {
     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
     Builder.SetInsertPoint(
         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
@@ -3868,7 +3895,7 @@
                              RdxPart);
   }
 
-  if (VF > 1) {
+  if (VF > 1 && !UseInloopReductions) {
     bool NoNaN = Legal->hasFunNoNaNAttr();
     ReducedPartRdx =
         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
@@ -4163,6 +4190,8 @@
   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
     for (unsigned Part = 0; Part < UF; ++Part) {
       // This is phase one of vectorizing PHIs.
+      if (Cost->useInloopReductions(cast<PHINode>(PN)))
+        VF = 1; // Inloop reductions retain a scalar PHI chain.
       Type *VecTy = (VF == 1) ?
          PN->getType() : VectorType::get(PN->getType(), VF);
       Value *EntryPart = PHINode::Create(
@@ -6492,6 +6521,35 @@
   }
 }
 
+void LoopVectorizationCostModel::categorizeReductions() {
+  // For the moment, without predicated reduction instructions, we do not
+  // support inloop reductions whilst folding the tail, and hence in those
+  // cases all reductions are currently outside the loop.
+  if (foldTailByMasking())
+    return;
+
+  for (auto &Reduction : Legal->getReductionVars()) {
+    PHINode *Phi = Reduction.first;
+    RecurrenceDescriptor &RdxDesc = Reduction.second;
+
+    // If the target would prefer this reduction to happen "in-loop", then we
+    // want to record it as such.
+    if (!ForceInloopReductions)
+      continue;
+
+    // Check that we can correctly put the reductions into the loop, by
+    // finding the chain of operations that leads from the loop exit value
+    // back to the phi.
+    SmallVector<Instruction *, 4> ReductionOperations =
+        RdxDesc.getReductionOpChain(Phi, TheLoop);
+    if (!ReductionOperations.empty()) {
+      LLVM_DEBUG(dbgs() << "LV: Using inloop reduction for phi: " << *Phi
+                        << "\n");
+      InloopReductions.insert(Phi);
+    } else
+      LLVM_DEBUG(dbgs() << "LV: Using outerloop reduction for phi: " << *Phi
+                        << "\n");
+  }
+}
+
 // TODO: we could return a pair of values that specify the max VF and
 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
@@ -6570,6 +6628,7 @@
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
     CM.selectUserVectorizationFactor(UserVF);
+    CM.categorizeReductions();
    buildVPlansWithVPRecipes(UserVF, UserVF);
     LLVM_DEBUG(printPlans(dbgs()));
     return {{UserVF, 0}};
@@ -6588,6 +6647,8 @@
     CM.collectInstsToScalarize(VF);
   }
 
+  CM.categorizeReductions();
+
   buildVPlansWithVPRecipes(1, MaxVF);
   LLVM_DEBUG(printPlans(dbgs()));
   if (MaxVF == 1)
@@ -6616,9 +6677,9 @@
   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
   VPCallbackILV CallbackILV(ILV);
 
-  VPTransformState State{BestVF, BestUF, LI,
-                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
-                         &ILV,   CallbackILV};
+  VPTransformState State{
+      BestVF, BestUF, LI,   DT, TTI, ILV.Builder, ILV.VectorLoopValueMap,
+      &ILV,   CallbackILV};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
   State.TripCount = ILV.getOrCreateTripCount(nullptr);
   State.CanonicalIV = ILV.Induction;
@@ -7115,7 +7176,6 @@
     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
       return Recipe;
     return new VPWidenPHIRecipe(Phi);
-    return new VPWidenPHIRecipe(Phi);
   }
 
   if (isa<TruncInst>(Instr) &&
@@ -7221,6 +7281,17 @@
     RecipeBuilder.recordRecipeOf(Entry.first);
     RecipeBuilder.recordRecipeOf(Entry.second);
   }
+  for (auto &Reduction : Legal->getReductionVars()) {
+    RecurrenceDescriptor &RdxDesc = Reduction.second;
+    if (CM.useInloopReductions(Reduction.first)) {
+      PHINode *Phi = Reduction.first;
+      SmallVector<Instruction *, 4> ReductionOperations =
+          RdxDesc.getReductionOpChain(Phi, OrigLoop);
+      for (auto &R : ReductionOperations)
+        RecipeBuilder.recordRecipeOf(R);
+      RecipeBuilder.recordRecipeOf(Phi);
+    }
+  }
 
   // For each interleave group which is relevant for this (possibly trimmed)
   // Range, add it to the set of groups to be later applied to the VPlan and add
@@ -7334,12 +7405,46 @@
     }
   }
 
+  // Adjust the recipes for any inloop reductions. The chain of instructions
+  // leading from the loop exit instr to the phi needs to be converted to
+  // reductions, with one operand being vector and the other being the scalar
+  // reduction chain.
+  if (Range.Start > 1) {
+    for (auto &Reduction : Legal->getReductionVars()) {
+      PHINode *Phi = Reduction.first;
+      RecurrenceDescriptor &RdxDesc = Reduction.second;
+      if (!CM.useInloopReductions(Phi))
+        continue;
+
+      SmallVector<Instruction *, 4> ReductionOperations =
+          RdxDesc.getReductionOpChain(Phi, OrigLoop);
+
+      Instruction *Chain = Phi;
+      for (Instruction *R : ReductionOperations) {
+        VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
+        assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC);
+        unsigned ChainOpId = R->getOperand(0) == Chain ? 0 : 1;
+        assert(R->getOperand(ChainOpId) == Chain);
+        VPValue *ChainOp = Plan->getVPValue(R->getOperand(ChainOpId));
+        VPValue *VecOp = Plan->getVPValue(R->getOperand(1 - ChainOpId));
+
+        VPReductionRecipe *RedRecipe = new VPReductionRecipe(
+            &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr());
+        WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
+        WidenRecipe->removeFromParent();
+        Chain = R;
+      }
+    }
+  }
+
   // Finally, if tail is folded by masking, introduce selects between the phi
   // and the live-out instruction of each reduction, at the end of the latch.
-  if (CM.foldTailByMasking()) {
+  if (CM.foldTailByMasking() && CM.hasOuterloopReductions()) {
     Builder.setInsertPoint(VPBB);
     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
     for (auto &Reduction : Legal->getReductionVars()) {
+      if (CM.useInloopReductions(Reduction.first))
+        continue;
       VPValue *Phi = Plan->getVPValue(Reduction.first);
       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
@@ -7494,6 +7599,19 @@
   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
 }
 
+void VPReductionRecipe::execute(VPTransformState &State) {
+  assert(!State.Instance && "Reduction being replicated.");
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *NewVecOp = State.get(VecOp, Part);
+    Value *NewRed = createTargetReduction(State.Builder, State.TTI, *RdxDesc,
+                                          NewVecOp, NoNaN);
+    Value *NewChain = State.get(ChainOp, Part);
+    Value *NewAcc = State.Builder.CreateBinOp(
+        (Instruction::BinaryOps)I->getOpcode(), NewRed, NewChain);
+    State.ValueMap.setVectorValue(I, Part, NewAcc);
+  }
+}
+
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -40,6 +40,7 @@
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -234,9 +235,10 @@
 /// needed for generating the output IR.
 struct VPTransformState {
   VPTransformState(unsigned VF, unsigned UF, LoopInfo *LI, DominatorTree *DT,
-                   IRBuilder<> &Builder, VectorizerValueMap &ValueMap,
-                   InnerLoopVectorizer *ILV, VPCallback &Callback)
-      : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder),
+                   const TargetTransformInfo *TTI, IRBuilder<> &Builder,
+                   VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV,
+                   VPCallback &Callback)
+      : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), TTI(TTI), Builder(Builder),
         ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
 
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
@@ -317,6 +319,9 @@
   /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
   DominatorTree *DT;
 
+  /// Hold a pointer to the TargetTransformInfo.
+  const TargetTransformInfo *TTI;
+
   /// Hold a reference to the IRBuilder used to generate output IR code.
   IRBuilder<> &Builder;
 
@@ -608,6 +613,7 @@
     VPInstructionSC,
     VPInterleaveSC,
     VPPredInstPHISC,
+    VPReductionSC,
     VPReplicateSC,
     VPWidenCallSC,
     VPWidenCanonicalIVSC,
@@ -1013,6 +1019,36 @@
   const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
 };
 
+/// A recipe to represent in-loop reduction operations, performing a reduction
+/// on a vector operand into a scalar value, and adding the result to a chain.
+class VPReductionRecipe : public VPRecipeBase {
+private:
+  /// The recurrence descriptor for the reduction in question.
+  RecurrenceDescriptor *RdxDesc;
+  /// The original instruction being converted to a reduction.
+  Instruction *I;
+  /// The VPValue of the scalar chain being accumulated.
+  VPValue *ChainOp;
+  /// The VPValue of the vector value to be reduced.
+  VPValue *VecOp;
+  /// Fast math flag for the target reduction.
+  bool NoNaN;
+
+public:
+  VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp,
+                    VPValue *VecOp, bool NoNaN)
+      : VPRecipeBase(VPReductionSC), RdxDesc(R), I(I), ChainOp(ChainOp),
+        VecOp(VecOp), NoNaN(NoNaN) {}
+
+  ~VPReductionRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPReductionSC;
+  }
+
+  /// Generate the reduction in the loop.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+};
+
 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
 /// copies of the original scalar type, one per lane, instead of producing a
 /// single copy of widened type for all lanes. If the instruction is known to be
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -781,6 +781,14 @@
   O << "\\l\"";
 }
 
+void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+                              VPSlotTracker &SlotTracker) const {
+  O << " +\n"
+    << Indent << "\""
+    << "REDUCE " << VPlanIngredient(I) << " with ChainOp " << ChainOp
+    << "\\l\"";
+}
+
 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
   O << " +\n"
Index: llvm/lib/Transforms/Vectorize/VPlanValue.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -44,6 +44,7 @@
   friend class VPBasicBlock;
   friend class VPInterleavedAccessInfo;
   friend class VPSlotTracker;
+  friend class LoopVectorizationPlanner;
 
   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
+++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
@@ -39,8 +39,7 @@
 ; CHECK-NEXT: "loop:\n" +
 ; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi 0, %iv.next\l" +
 ; CHECK-NEXT: "WIDEN\l""  %cond0 = icmp %iv, 13\l" +
-; CHECK-NEXT: "WIDEN-SELECT%s = select %cond0, 10, 20\l" +
-; CHECK-NEXT: "EMIT vp<%1> = icmp ule ir<%iv> vp<%0>\l"
+; CHECK-NEXT: "WIDEN-SELECT%s = select %cond0, 10, 20\l"
 ; CHECK-NEXT: ]
 define void @test() {
 entry:
Index: llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -force-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -18,7 +18,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
@@ -26,38 +26,40 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[TMP10]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP14]] = add i32 [[TMP13]], [[TMP12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP23:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[SUM_02]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP15]]
-; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[SUM_02]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP17]]
+; CHECK-NEXT:    [[TMP23]] = add i32 [[TMP22]], [[TMP19]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !2
 ; CHECK:       ._crit_edge.loopexit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP23]], [[DOTLR_PH]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
@@ -102,7 +104,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
@@ -110,38 +112,40 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP11]] = mul <4 x i32> [[TMP10]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP14]] = mul i32 [[TMP13]], [[TMP12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 1, [[DOTLR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[PROD_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[PROD_02:%.*]] = phi i32 [ [[TMP23:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[PROD_02]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[TMP15]]
-; CHECK-NEXT:    [[TMP21]] = mul i32 [[TMP20]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[PROD_02]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP17]]
+; CHECK-NEXT:    [[TMP23]] = mul i32 [[TMP22]], [[TMP19]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !5
 ; CHECK:       ._crit_edge.loopexit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP23]], [[DOTLR_PH]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[PROD_0_LCSSA:%.*]] = phi i32 [ 1, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
@@ -186,7 +190,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
@@ -195,14 +199,15 @@
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP13]] = add i32 [[TMP12]], [[TMP11]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -211,21 +216,21 @@
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = mul nsw i32 [[TMP17]], [[TMP15]]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[SUM_02]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[TMP18]]
+; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP22:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw i32 [[TMP18]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[SUM_02]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22]] = add i32 [[TMP21]], [[TMP19]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !7
 ; CHECK:       ._crit_edge.loopexit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP22]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
@@ -270,20 +275,21 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 19, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 19, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP10]] = mul <4 x i32> [[TMP9]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP12]] = mul i32 [[TMP11]], [[TMP10]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP10]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -292,19 +298,19 @@
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP18:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = mul i32 [[SUM_02]], [[TMP14]]
-; CHECK-NEXT:    [[TMP18]] = mul i32 [[TMP17]], [[TMP16]]
+; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP19:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[SUM_02]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19]] = mul i32 [[TMP18]], [[TMP17]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop !9
 ; CHECK:       ._crit_edge.loopexit:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[DOT_CRIT_EDGE]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[DOTLCSSA]], [[DOT_CRIT_EDGE_LOOPEXIT]] ]
@@ -348,7 +354,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 120, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 120, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
@@ -356,17 +362,17 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = add <4 x i32> [[TMP7]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 120, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 120, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -382,7 +388,7 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !11
 ; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 120, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -427,20 +433,21 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ -1, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = and <4 x i32> [[TMP7]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP10]] = and i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -451,11 +458,11 @@
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = and i32 [[RESULT_08]], [[TMP11]]
-; CHECK-NEXT:    [[AND]] = and i32 [[ADD]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = and i32 [[RESULT_08]], [[TMP12]]
+; CHECK-NEXT:    [[AND]] = and i32 [[ADD]], [[TMP13]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -506,7 +513,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
@@ -514,17 +521,17 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = or <4 x i32> [[TMP7]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP9]] = or i32 [[TMP8]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -540,7 +547,7 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !15
 ; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -585,7 +592,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
@@ -593,17 +600,17 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = xor <4 x i32> [[TMP7]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP9]] = xor i32 [[TMP8]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -619,7 +626,7 @@
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !17
 ; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[XOR_LCSSA]], [[FOR_END_LOOPEXIT]] ]
@@ -664,20 +671,21 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = fadd fast <4 x float> [[TMP7]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd float [[TMP7]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP10]] = fadd float [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -688,11 +696,11 @@
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[TMP11]]
-; CHECK-NEXT:    [[FADD]] = fadd fast float [[ADD]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[TMP12]]
+; CHECK-NEXT:    [[FADD]] = fadd fast float [[ADD]], [[TMP13]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -743,20 +751,21 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP8]] = fmul fast <4 x float> [[TMP7]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul float [[TMP7]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP10]] = fmul float [[TMP9]], [[TMP8]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP8]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -767,11 +776,11 @@
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[TMP11]]
-; CHECK-NEXT:    [[FMUL]] = fmul fast float [[ADD]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[TMP12]]
+; CHECK-NEXT:    [[FMUL]] = fmul fast float [[ADD]], [[TMP13]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
@@ -1129,7 +1138,7 @@
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
@@ -1137,39 +1146,41 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP11]] = add <4 x i32> [[TMP10]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT:    [[TMP14]] = add i32 [[TMP13]], [[TMP12]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !30
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP21:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[SUM_02:%.*]] = phi i32 [ [[TMP23:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[SUM_02]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP15]]
-; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP20]], [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[SUM_02]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP17]]
+; CHECK-NEXT:    [[TMP23]] = add i32 [[TMP22]], [[TMP19]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !31
 ; CHECK:       ._crit_edge:
-; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    [[SUM_COPY:%.*]] = phi i32 [ [[TMP21]], [[DOTLR_PH]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_LCSSA:%.*]] = phi i32 [ [[TMP23]], [[DOTLR_PH]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_COPY:%.*]] = phi i32 [ [[TMP23]], [[DOTLR_PH]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[F1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[SUM_LCSSA]], [[DOT_CRIT_EDGE]] ]